# diff --git a/R/get_csv_specs.R b/R/get_csv_specs.R new file mode 100755 index 0000000..1f6dd58 --- /dev/null +++ b/R/get_csv_specs.R @@ -0,0 +1,726 @@

# - [X] absolute filepath
# - [X] encoding of the filepath
# - [X] file name
# - [X] file extension
# - [X] file size
# - [X] file is not empty
# - [X] file exists?
# - [X] file is readable?
# - [X] file is writable?
# - [X] file is not binary?
# - [X] number of rows
# - [X] does it have a newline at the end
# - [X] determine best guess encoding
# - [ ] determine if best guess encoding is used and then converted to UTF-8 there are non-UTF-8 characters
# - [X] determine delimiter
# - [X] determine decimal mark
# - [X] determine grouping mark
# - [X] determine number of columns
# - [ ] determine/guess if a header is included
# - [X] determine if column names match expectation
# - [ ] is investor_name column all strings
# - [ ] is portfolio_name column all strings
# - [ ] is isin column all strings
# - [X] is isin column all valid isins (Luhn)
# - [ ] is isin column all isins that are in our financial data
# - [ ] is market_value all numeric
# - [ ] is market_value all numeric after using determined decimal and grouping mark
# - [ ] is market value negative
# - [ ] is currency column all strings
# - [ ] is currency column all 3 character
# - [ ] is currency column all alpha strings
# - [ ] is currency column all uppercase
# - [x] is currency column all valid, current ISO 4217 alpha currency codes
# - [ ] is currency column all currency codes that exist in our currency exchange rate data

#' Inspect a set of portfolio CSV files and report on their usability
#'
#' Runs a battery of checks (existence, size, permissions, binary/CSV content,
#' encoding, trailing newline, delimiter, decimal/grouping marks, column
#' counts and names, column types, ISIN and currency validity) on the given
#' files, printing cli alerts as it goes and dropping files that cannot be
#' used from further consideration.
#'
#' @param files character vector of file paths, or a single directory path
#'   (in which case every file in that directory is checked)
#' @param expected_colnames character vector of exactly 5 column names, in the
#'   order: investor name, portfolio name, ISIN, market value, currency
#' @return (invisibly) a tibble with one row per surviving file and one
#'   column per collected spec
get_csv_specs <- function(files, expected_colnames = c("Investor.Name", "Portfolio.Name", "ISIN", "MarketValue", "Currency")) {
  # the per-column checks below index expected_colnames positionally, so the
  # length must be exactly 5
  stopifnot(length(expected_colnames) == 5L)

  # dispatch a cli alert of the requested severity
  alert_by_type <- function(type, ...) {
    switch(
      type,
      info = cli::cli_alert_info(...),
      warning = cli::cli_alert_warning(...),
      danger = cli::cli_alert_danger(...),
      success = cli::cli_alert_success(...)  # FIX: removed trailing comma, which made the switch() call a syntax error
    )
  }

  # print an alert plus an (abbreviated, max 10 + "and N more") bulleted list
  # of the affected filenames
  report_alert_files <- function(msg, bullets, type = "info", info = NULL) {
    cli::cli_div(theme = list(`.indented` = list(`margin-left` = 2), `.file` = list(color = "blue")))
    on.exit(cli::cli_end(), add = TRUE)

    cli::cli({
      alert_by_type(type, msg)
      if (!is.null(info)) {
        alert_by_type("info", info, class = "indented")
      }
      if (length(bullets) > 10L) {
        abbreviated <- c(bullets[1:10], paste0("\u2026 and {.strong ", length(bullets) - 10, " more}"))
        cli::cli_bullets(abbreviated, class = "file indented")
      } else {
        cli::cli_bullets(bullets, class = "file indented")
      }
    })
  }

  # a single directory may be passed instead of a vector of files
  if (length(files) == 1 && fs::is_dir(files)) {
    files <- file.path(files, list.files(files))
  }
  files_df <- tibble::tibble(input = files, id = seq_along(files))

  files_df$filename <- basename(files_df$input)
  files_df$extension <- tools::file_ext(files_df$input)  # FIX: was tools:::file_ext; file_ext() is exported

  files_df$filepath <- fs::path_abs(fs::path_expand(files_df$input))

  # --- existence -------------------------------------------------------------
  files_df$file_exists <- unname(fs::file_exists(files_df$filepath))

  if (all(files_df$file_exists)) {
    cli::cli_alert_success("all files exist")
  } else if (all(!files_df$file_exists)) {
    cli::cli_alert_danger("none of the files exist")
    cli::cli_abort("the {.fun get_csv_specs} function had to stop because none of the files could be used")
  } else {
    alert_files <- files_df$filename[!files_df$file_exists]
    report_alert_files("the following files do not exist and will not be considered further:", alert_files, type = "danger")
    files_df <- files_df[files_df$file_exists, ]
  }

  # --- size ------------------------------------------------------------------
  files_df$file_size <- unname(fs::file_size(files_df$filepath))

  if (all(files_df$file_size > 0)) {
    cli::cli_alert_success("all files have a size > 0")
  } else if (all(files_df$file_size == 0)) {
    cli::cli({
      cli::cli_alert_danger("all of the files have a size of 0")
      cli::cli_alert_info("this might mean that they are un-downloaded Dropbox files or are empty files", class = "indented")
    })
    cli::cli_abort("the {.fun get_csv_specs} function had to stop because none of the files could be used")
  } else {
    alert_files <- files_df$filename[files_df$file_size == 0]
    report_alert_files("the following files have a size of 0 and will not be considered further:", alert_files, type = "danger", info = "this might mean that they are un-downloaded Dropbox files or are empty files")
    files_df <- files_df[files_df$file_size > 0, ]
  }

  # --- permissions -----------------------------------------------------------
  files_df$file_read_access <- unname(fs::file_access(files_df$filepath, mode = "read"))

  if (all(files_df$file_read_access)) {
    cli::cli_alert_success("all files have read access")
  } else if (all(!files_df$file_read_access)) {
    cli::cli_alert_danger("all of the files do not have read access")
    cli::cli_abort("the {.fun get_csv_specs} function had to stop because none of the files could be used")
  } else {
    alert_files <- files_df$filename[!files_df$file_read_access]
    report_alert_files("the following files do not have read access and will not be considered further:", alert_files, type = "danger")
    files_df <- files_df[files_df$file_read_access, ]
  }

  files_df$file_write_access <- unname(fs::file_access(files_df$filepath, mode = "write"))

  # --- binary check ----------------------------------------------------------
  # NOTE(review): shells out to the `file` utility, which is typically absent
  # on Windows — confirm the supported platforms
  files_df$mime_encoding <- vapply(files_df$filepath, function(x) system2("file", args = c("-b", "--mime-encoding", shQuote(x)), stdout = TRUE), character(1))

  if (all(files_df$mime_encoding != "binary")) {
    cli::cli_alert_success("all files are not binary files")
  } else if (all(files_df$mime_encoding == "binary")) {
    cli::cli_alert_danger("all of the files are binary files")
    cli::cli_abort("the {.fun get_csv_specs} function had to stop because none of the files could be used")
  } else {
    alert_files <- files_df$filename[files_df$mime_encoding == "binary"]
    report_alert_files("the following files are binary files and will not be considered further:", alert_files, type = "danger")
    files_df <- files_df[files_df$mime_encoding != "binary", ]
  }

  # --- content type ----------------------------------------------------------
  files_df$content_type <- guess_content_types(files_df$filepath)

  if (all(grepl("/csv$|/comma-separated-values$", files_df$content_type))) {
    cli::cli_alert_success("all files are CSV files")
  } else if (all(!grepl("/csv$|/comma-separated-values$", files_df$content_type))) {
    cli::cli_alert_danger("all of the files are not CSV files")
    cli::cli_abort("the {.fun get_csv_specs} function had to stop because none of the files could be used")
  } else {
    alert_files <- files_df$filename[!grepl("/csv$|/comma-separated-values$", files_df$content_type)]
    report_alert_files("the following files are not CSV files and will not be considered further:", alert_files, type = "danger")
    files_df <- files_df[grepl("/csv$|/comma-separated-values$", files_df$content_type), ]
  }

  # --- filepath encoding (collected but currently not reported) --------------
  files_df$filepath_is_ascii <- stringi::stri_enc_isascii(files_df$filepath)

  detected_filepath_encoding_tbls <- stringi::stri_enc_detect(files_df$filepath)
  files_df$filepath_encoding <- vapply(detected_filepath_encoding_tbls, function(x) { x$Encoding[[1]] }, character(1))

  files_df$filepath_declared_encoding <- Encoding(files_df$filepath)

  # if (all(files_df$filepath_is_ascii)) {
  #   cli::cli_alert_success("all filepaths are in ASCII")
  # } else {
  #   # cli::cli_alert_warning("the following filepaths are not in ASCII")
  #   # cli::cli_bullets(setNames(files_df$filepath[!files_df$filepath_is_ascii], " "))
  #   cli::cli_text("the following filepaths are not in ASCII: {.file {files_df$filepath[!files_df$filepath_is_ascii]}}")
  # }

  # if (any(files_df$filepath_encoding == "UTF-8")) {
  #   # cli::cli_alert_warning("the following filepaths are in UTF-8, which can cause trouble, especially on Windows")
  #   cli::cli_text("the following filepaths are in UTF-8, which can cause trouble, especially on Windows: {.file {files_df$filepath[!files_df$filepath_is_ascii]}}")
  # }

  # --- file encoding ---------------------------------------------------------
  files_df$file_encoding <- pacta.portfolio.analysis::guess_file_encoding(files_df$filepath)

  if (all(grepl("ascii|utf-8", files_df$file_encoding, ignore.case = TRUE))) {
    cli::cli_alert_success("all files are encoded in ASCII or UTF-8")
  } else {
    alert_files <- files_df$filename[!grepl("ascii|utf-8", files_df$file_encoding, ignore.case = TRUE)]
    report_alert_files("the following files are not encoded in ASCII or UTF-8:", alert_files, type = "warning", info = "this can be adapted to automatically by the {.fun read_portfolio_csv} function")
  }

  files_df$num_of_lines <- guess_num_of_lines(files_df$filepath)

  # --- trailing newline ------------------------------------------------------
  files_df$last_line_has_newline <- has_newline_at_end(files_df$filepath)

  if (all(files_df$last_line_has_newline)) {
    cli::cli_alert_success("all files have a newline at the end")
  } else {
    alert_files <- files_df$filename[!files_df$last_line_has_newline]
    report_alert_files("the following files do not have a newline at the end:", alert_files, type = "warning", info = "this can be adapted to automatically by the {.fun read_portfolio_csv} function")
  }

  # --- delimiter -------------------------------------------------------------
  files_df$delimiter <- pacta.portfolio.analysis::guess_delimiter(files_df$filepath)

  if (all(files_df$delimiter == ",")) {
    cli::cli_alert_success(paste0("all files use {.strong ", cli::style_inverse(","), "} for a delimiter"))
  } else {
    alert_files <- files_df$filename[files_df$delimiter != ","]
    report_alert_files(paste0("the following files do not use {.strong ", cli::style_inverse(","), "} for a delimiter:"), alert_files, type = "warning", info = "this can be adapted to automatically by the {.fun read_portfolio_csv} function")
  }

  # --- readability -----------------------------------------------------------
  files_df$read_without_error <- validate_read_without_error(files_df$filepath, files_df$file_encoding, files_df$delimiter)

  if (all(files_df$read_without_error)) {
    cli::cli_alert_success(paste0("all files can be read without error"))
  } else {
    # FIX: was `isFALSE(files_df$read_without_error)`, which is scalar (not
    # vectorized) and therefore always selected zero files, so failing files
    # were never reported nor removed
    alert_files <- files_df$filename[!files_df$read_without_error]
    report_alert_files("the following files can not be read without error and will not be considered further:", alert_files, type = "danger")
    files_df <- files_df[files_df$read_without_error, ]
  }

  # --- column count ----------------------------------------------------------
  files_df$num_of_columns <- guess_num_of_columns(files_df$filepath, files_df$file_encoding, files_df$delimiter)

  if (all(files_df$num_of_columns == 5L)) {
    cli::cli_alert_success(paste0("all files have {.strong 5} columns"))
  } else {
    # FIX: these two cases were chained with `else if`, so when files with
    # more than 5 AND files with fewer than 4 columns were both present, the
    # unusable (<4 column) files were never dropped
    if (any(files_df$num_of_columns > 5L)) {
      alert_files <- files_df$filename[files_df$num_of_columns > 5L]
      report_alert_files("the following files have more than {.strong 5} columns:", alert_files, type = "warning", info = "this can usually be adapted to automatically by the {.fun read_portfolio_csv} function")
    }
    if (any(files_df$num_of_columns < 4L)) {
      alert_files <- files_df$filename[files_df$num_of_columns < 4L]
      report_alert_files("the following files have less than {.strong 4} columns and will not be considered further:", alert_files, type = "danger")
      files_df <- files_df[files_df$num_of_columns >= 4L, ]
    }
  }

  # --- decimal mark ----------------------------------------------------------
  files_df$decimal_mark <- pacta.portfolio.analysis:::guess_decimal_mark(files_df$filepath)

  if (all(!is.na(files_df$decimal_mark)) && all(files_df$decimal_mark == ".")) {
    cli::cli_alert_success(paste0("all files use {.strong ", cli::style_inverse("."), "} for a decimal mark"))
  } else {
    # FIX: was `is.na(x) || grepl("^[.]$", x)` — a scalar OR combined with an
    # inverted condition that selected the files that *do* use "."
    alert_files <- files_df$filename[is.na(files_df$decimal_mark) | files_df$decimal_mark != "."]
    report_alert_files(paste0("the following files do not use {.strong ", cli::style_inverse("."), "} for a decimal mark"), alert_files, type = "warning", info = "this can be adapted to automatically by the {.fun read_portfolio_csv} function")
  }

  # --- grouping mark ---------------------------------------------------------
  files_df$grouping_mark <- pacta.portfolio.analysis:::guess_grouping_mark(filepaths = files_df$filepath)

  # FIX: was `all(!is.na(x) && x == ",")` — vector `&&` inside all(), which is
  # an error in R >= 4.3 for inputs longer than 1
  if (all(!is.na(files_df$grouping_mark)) && all(files_df$grouping_mark == ",")) {
    cli::cli_alert_success(paste0("all files use {.strong ", cli::style_inverse(","), "} for a grouping mark"))
  } else {
    # FIX: same scalar-OR / inverted-condition bug as the decimal mark check
    alert_files <- files_df$filename[is.na(files_df$grouping_mark) | files_df$grouping_mark != ","]
    report_alert_files(paste0("the following files do not use {.strong ", cli::style_inverse(","), "} for a grouping mark"), alert_files, type = "warning", info = "this can be adapted to automatically by the {.fun read_portfolio_csv} function")
  }

  # --- field consistency -----------------------------------------------------
  files_df$tokenizer <- get_tokenizers(files_df$filepath, files_df$file_encoding, files_df$delimiter)

  files_df$fields_per_line <- get_fields_per_line(files_df$filepath, files_df$tokenizer)
  files_df$has_consistent_fields_per_line <- vapply(X = files_df$fields_per_line, FUN = function(x) all(x == x[1]), FUN.VALUE = logical(1))

  if (all(files_df$has_consistent_fields_per_line)) {
    cli::cli_alert_success(paste0("all files have a consistent number of fields per line"))
  } else {
    alert_files <- files_df$filename[!files_df$has_consistent_fields_per_line]
    report_alert_files("the following files do not have a consistent number of fields per line and will not be considered further:", alert_files, type = "danger")
    files_df <- files_df[files_df$has_consistent_fields_per_line, ]
  }

  # --- column names ----------------------------------------------------------
  files_df$column_names <- get_column_names(files_df$filepath, files_df$file_encoding, files_df$delimiter)

  files_df$has_expected_colnames <- vapply(X = files_df$column_names, FUN = function(x) isTRUE(all.equal(target = expected_colnames, current = x)), FUN.VALUE = logical(1))

  if (all(files_df$has_expected_colnames)) {
    cli::cli_alert_success(paste0("all files have the expected column names"))
  } else if (all(!files_df$has_expected_colnames)) {
    cli::cli_alert_warning("none of the files have the expected column names")
  } else {
    alert_files <- files_df$filename[!files_df$has_expected_colnames]
    report_alert_files("the following files do not have the expected column names:", alert_files, type = "warning")
  }

  # --- per-column content checks ---------------------------------------------
  files_df$readr_locale <- get_locales(encodings = files_df$file_encoding, decimal_marks = files_df$decimal_mark, grouping_marks = files_df$grouping_mark)

  investor_name_colname <- expected_colnames[[1]]
  portfolio_name_colname <- expected_colnames[[2]]
  isin_colname <- expected_colnames[[3]]
  market_value_colname <- expected_colnames[[4]]
  currency_colname <- expected_colnames[[5]]

  # read every file in full, using the per-file delimiter/locale guessed above
  test <- purrr::map(seq_along(files_df$filepath), ~ suppressWarnings(suppressMessages(readr::read_delim(files_df$filepath[.x], delim = files_df$delimiter[.x], locale = files_df$readr_locale[.x][[1]], progress = FALSE, show_col_types = FALSE))))

  files_df$investor_name_is_string <- vapply(test, function(x) is.character(x[[investor_name_colname]]), logical(1))

  files_df$portfolio_name_is_string <- vapply(test, function(x) is.character(x[[portfolio_name_colname]]), logical(1))

  files_df$market_value_is_numeric <- vapply(test, function(x) is.numeric(x[[market_value_colname]]), logical(1))

  # FIX: added na.rm = TRUE — with any NA market value, any(x < 0) is NA and
  # the flag column became NA instead of TRUE/FALSE
  files_df$market_value_has_negatives <- vapply(test, function(x) any(x[[market_value_colname]] < 0, na.rm = TRUE), logical(1))

  files_df$market_value_has_nas <- vapply(test, function(x) any(is.na(x[[market_value_colname]])), logical(1))

  files_df$valid_iso4217c_codes <- vapply(test, function(x) all(is_valid_currency_code(x[[currency_colname]])), FUN.VALUE = logical(1), USE.NAMES = FALSE)

  if (all(files_df$valid_iso4217c_codes)) {
    cli::cli_alert_success(paste0("all files have only valid iso4217c currency codes"))
  } else {
    alert_files <- files_df$filename[!files_df$valid_iso4217c_codes]
    report_alert_files("the following files have some invalid iso4217c currency codes:", alert_files, type = "warning")
  }

  files_df$has_invalid_isins <- vapply(test, function(x) any(!is_valid_isin(x[[isin_colname]])), FUN.VALUE = logical(1), USE.NAMES = FALSE)

  if (all(!files_df$has_invalid_isins)) {
    cli::cli_alert_success(paste0("all files have only valid ISINs"))
  } else {
    alert_files <- files_df$filename[files_df$has_invalid_isins]
    report_alert_files("the following files have some invalid ISINs:", alert_files, type = "warning")
  }

  invisible(files_df)
}


#' Determine whether each file ends with a newline character
#'
#' Reads each file in 2KB binary chunks; when EOF is reached, the last
#' non-empty chunk's final byte determines the answer.
#'
#' @param filepaths character vector of file paths
#' @return unnamed logical vector; NA for files that are not accessible text
#'   files
has_newline_at_end <- function(filepaths) {
  filepaths <- pacta.portfolio.analysis:::simplify_if_one_col_df(filepaths)
  stopifnot("`filepaths` must be a character vector" = typeof(filepaths) == "character")
  filepaths <- pacta.portfolio.analysis:::canonize_path(filepaths)

  vapply(
    X = filepaths,
    FUN = function(filepath) {
      if (!is_file_accessible(filepath) || !is_text_file(filepath)) {
        return(NA)
      }

      con <- file(filepath, "rb")
      on.exit(close(con))

      not_at_end <- TRUE
      prev_chars <- ""  # defined up front so it exists even for a zero-length first read
      chars <- ""
      while (not_at_end) {
        prev_chars <- chars
        chars <- readChar(con, nchars = 2048L, useBytes = TRUE)
        # readChar() returns a zero-length vector at EOF
        if (length(chars) == 0L) { not_at_end <- FALSE }
      }

      grepl("[\n\r]$", prev_chars)
    },
    FUN.VALUE = logical(1),
    USE.NAMES = FALSE
  )
}


# NOTE(review): unused skeleton for a future guess_* helper; the FUN body is
# empty, so calling this would error (vapply expects character(1)) — confirm
# whether it should be removed or completed
guess_ <- function(filepaths, encodings) {
  vapply(
    X = seq_along(filepaths),
    FUN = function(i) {

    },
    FUN.VALUE = character(1)
  )
}


# NOTE(review): this helper appears unused and is broken as written — FUN
# returns a data frame (the header row) while FUN.VALUE promises integer(1),
# so vapply() would error if it were ever called; confirm intent before use
check_column_names <- function(filepaths, encodings, delimiters) {
  vapply(
    X = seq_along(filepaths),
    FUN = function(i) {
      readr::read_delim(file = filepaths[[i]], locale = readr::locale(encoding = encodings[[i]]), delim = delimiters[[i]], n_max = 1L, col_names = FALSE, show_col_types = FALSE, progress = FALSE)
    },
    FUN.VALUE = integer(1)
  )
}

#' Read just the header row of each file and return its column names
#'
#' @param filepaths character vector of file paths
#' @param encodings character vector of encodings (parallel to `filepaths`)
#' @param delimiters character vector of delimiters (parallel to `filepaths`)
#' @return a list (one element per file) of character vectors of column names
get_column_names <- function(filepaths, encodings, delimiters) {
  vapply(
    X = seq_along(filepaths),
    FUN = function(i) {
      list(
        names(
          suppressMessages(
            readr::read_delim(
              file = filepaths[[i]],
              delim = delimiters[[i]],
              locale = readr::locale(encoding = encodings[[i]]),
              n_max = 1L,
              trim_ws = TRUE,
              col_types = readr::cols(.default = "c"),
              show_col_types = FALSE,
              progress = FALSE
            )
          )
        )
      )
    },
    FUN.VALUE = list(1)
  )
}


#' Count the number of delimited fields on every line of each file
#'
#' @param filepaths character vector of file paths
#' @param tokenizers list of readr tokenizers (parallel to `filepaths`)
#' @return a list (one element per file) of integer vectors of field counts
get_fields_per_line <- function(filepaths, tokenizers) {
  vapply(
    X = seq_along(filepaths),
    FUN = function(i) {
      list(
        readr::count_fields(
          file = filepaths[[i]],
          tokenizer = tokenizers[[i]]
        )
      )
    },
    FUN.VALUE = list(1)
  )
}


#' Build a readr locale per file from guessed encoding and number marks
#'
#' NA marks fall back to the readr defaults ("." decimal, "," grouping).
#'
#' @return a list of readr locale objects (one per input element)
get_locales <- function(encodings, decimal_marks, grouping_marks) {
  vapply(
    X = seq_along(encodings),
    FUN = function(i) {
      if (is.na(decimal_marks[[i]])) { decimal_marks[[i]] <- "."}
      if (is.na(grouping_marks[[i]])) { grouping_marks[[i]] <- ","}
      list(
        readr::locale(
          encoding = encodings[[i]],
          decimal_mark = decimal_marks[[i]],
          grouping_mark = grouping_marks[[i]]
        )
      )
    },
    FUN.VALUE = list(1)
  )
}


#' Build a readr delimited tokenizer per file
#'
#' @return a list of readr tokenizers (one per input element)
get_tokenizers <- function(filepaths, encodings, delimiters) {
  vapply(
    X = seq_along(filepaths),
    FUN = function(i) {
      list(
        readr::tokenizer_delim(
          delim = delimiters[[i]],
          quote = "\"",
          na = "NA",
          quoted_na = TRUE,
          comment = "",
          trim_ws = TRUE,
          escape_double = TRUE,
          escape_backslash = FALSE,
          skip_empty_rows = TRUE
        )
      )
    },
    FUN.VALUE = list(1)
  )
}


#' Guess the MIME content type of each file (first candidate from wand)
#'
#' @param filepaths character vector of file paths
#' @return named character vector of content types
guess_content_types <- function(filepaths) {
  vapply(
    X = filepaths,
    FUN = function(x) {
      wand::guess_content_type(x)[[1]]
    },
    FUN.VALUE = character(1))
}


# guess_encodings <- function(filepaths) {
#   file_encodings <-
#     vapply(X = filepaths,
#            FUN = function(x) {
#              readr::guess_encoding(file = x, n_max = -1, threshold = 0)$encoding[[1]]
#            },
#            FUN.VALUE = character(1),
#            USE.NAMES = FALSE)
#
#   raw_lines <- lapply(X = filepaths, FUN = readr::read_lines_raw)
#   has_cp850_chars <-
#     vapply(
#       X = raw_lines,
#       FUN = function(x) {
#         any(
#           vapply(
#             X = x,
#             FUN = function(y) {
#               any(y == as.raw(0x94))
#             },
#             FUN.VALUE = logical(1)
#           )
#         )
#       },
#       FUN.VALUE = logical(1)
#     )
#
#   ifelse(has_cp850_chars, "cp850", file_encodings)
# }


# guess_decimal_mark <- function(filepath, encoding, delimiter) {
#   decimal_mark <- ifelse(delimiter == ";", ",", ".")
#   grouping_mark <- ifelse(decimal_mark == ",", ".", ",")
#
#   cust_locale <-
#     readr::locale(
#       decimal_mark = decimal_mark,
#       grouping_mark = grouping_mark,
#       encoding = encoding
#     )
#
#   char_data <-
#     readr::read_delim(
#       file = filepath,
#       delim = delimiter,
#       locale = cust_locale,
#       trim_ws = TRUE,
#       col_types = readr::cols(.default = "c"),
#       col_names = TRUE,
#       progress = FALSE
#     )
#
#   all_num_chars <- char_data[[4]]
#
#   grp_mrk_com_regex <- "^((?![,]).)*$|[,][[:digit:]]{3}[^[:digit:]]|[,][[:digit:]]{3}$"
#   grp_mrk_dot_regex <- "^((?![.]).)*$|[.][[:digit:]]{3}[^[:digit:]]|[.][[:digit:]]{3}$"
#
#   has_comma <- any(grepl("[,]", all_num_chars))
#   has_dot <- any(grepl("[.]", all_num_chars))
#
#   comma_only_before_3 <- all(has_comma, grepl(grp_mrk_com_regex, all_num_chars, perl = TRUE))
#   dot_only_before_3 <- all(has_dot, grepl(grp_mrk_dot_regex, all_num_chars, perl = TRUE))
#
#   nums_have_space <- any(grepl(" ", char_data[[4]]))
#
#   if (comma_only_before_3 && !dot_only_before_3) {
#     decimal_mark <- "."
#   } else if (dot_only_before_3 && !comma_only_before_3) {
#     decimal_mark <- ","
#   } else if (has_comma && !comma_only_before_3 && !dot_only_before_3) {
#     decimal_mark <- ","
#   } else if (has_dot && !dot_only_before_3 && !comma_only_before_3) {
#     decimal_mark <- "."
#   }
#
#   decimal_mark
# }
#
#
# guess_decimal_marks <- function(filepaths, encodings, delimiters) {
#   vapply(
#     X = seq_along(filepaths),
#     FUN = function(i) {
#       guess_decimal_mark(
#         filepath = filepaths[[i]],
#         encoding = encodings[[i]],
#         delimiter = delimiters[[i]]
#       )
#     },
#     FUN.VALUE = character(1)
#   )
# }


# guess_delimiters <- function(filepaths, encodings) {
#   vapply(
#     X = seq_along(filepaths),
#     FUN = function(i) {
#       line1 <- readr::read_lines(file = filepaths[[i]], locale = readr::locale(encoding = encodings[[i]]), n_max = 1, progress = FALSE)
#       commas <- stringr::str_count(line1, ",")
#       semicolons <- stringr::str_count(line1, ";")
#       delim <- ifelse(commas > semicolons, ",", ";")
#     },
#     FUN.VALUE = character(1)
#   )
# }


# guess_grouping_mark <- function(filepath, encoding, delimiter) {
#   decimal_mark <- ifelse(delimiter == ";", ",", ".")
#   grouping_mark <- ifelse(decimal_mark == ",", ".", ",")
#
#   cust_locale <-
#     readr::locale(
#       decimal_mark = decimal_mark,
#       grouping_mark = grouping_mark,
#       encoding = encoding
#     )
#
#   char_data <-
#     readr::read_delim(
#       file = filepath,
#       delim = delimiter,
#       locale = cust_locale,
#       trim_ws = TRUE,
#       col_types = readr::cols(.default = "c"),
#       col_names = TRUE,
#       progress = FALSE
#     )
#
#   all_num_chars <- char_data[[4]]
#
#   grp_mrk_com_regex <- "^((?![,]).)*$|[,][[:digit:]]{3}[^[:digit:]]|[,][[:digit:]]{3}$"
#   grp_mrk_dot_regex <- "^((?![.]).)*$|[.][[:digit:]]{3}[^[:digit:]]|[.][[:digit:]]{3}$"
#
#   has_comma <- any(grepl("[,]", all_num_chars))
#   has_dot <- any(grepl("[.]", all_num_chars))
#
#   comma_only_before_3 <- all(has_comma, grepl(grp_mrk_com_regex, all_num_chars, perl = TRUE))
#   dot_only_before_3 <- all(has_dot, grepl(grp_mrk_dot_regex, all_num_chars, perl = TRUE))
#
#   nums_have_space <- any(grepl(" ", char_data[[4]]))
#
#   if (comma_only_before_3 && !dot_only_before_3) {
#     grouping_mark <- ","
#     decimal_mark <- "."
#   } else if (dot_only_before_3 && !comma_only_before_3) {
#     grouping_mark <- "."
#     decimal_mark <- ","
#   } else if (has_comma && !comma_only_before_3 && !dot_only_before_3) {
#     grouping_mark <- "."
#     decimal_mark <- ","
#   } else if (has_dot && !dot_only_before_3 && !comma_only_before_3) {
#     grouping_mark <- ","
#     decimal_mark <- "."
#   }
#
#   if (nums_have_space) { grouping_mark <- " " }
#
#   grouping_mark
# }
#
#
# guess_grouping_marks <- function(filepaths, encodings, delimiters) {
#   vapply(
#     X = seq_along(filepaths),
#     FUN = function(i) {
#       guess_grouping_mark(
#         filepath = filepaths[[i]],
#         encoding = encodings[[i]],
#         delimiter = delimiters[[i]]
#       )
#     },
#     FUN.VALUE = character(1)
#   )
# }


#' Count the columns in the header row of each delimited file
#'
#' @param filepaths character vector of file paths
#' @param encodings character vector of encodings (parallel to `filepaths`)
#' @param delimiters character vector of delimiters (parallel to `filepaths`)
#' @return integer vector of column counts
guess_num_of_columns <- function(filepaths, encodings, delimiters) {
  # read only the first row of one file and count its fields
  count_columns <- function(path, enc, delim) {
    header_row <- readr::read_delim(
      file = path,
      locale = readr::locale(encoding = enc),
      delim = delim,
      n_max = 1L,
      col_names = FALSE,
      show_col_types = FALSE,
      progress = FALSE
    )
    ncol(header_row)
  }

  vapply(
    X = seq_along(filepaths),
    FUN = function(idx) {
      count_columns(filepaths[[idx]], encodings[[idx]], delimiters[[idx]])
    },
    FUN.VALUE = integer(1)
  )
}


#' Count the number of lines in each file
#'
#' @param filepaths character vector of file paths
#' @return integer vector of line counts, named by filepath
guess_num_of_lines <- function(filepaths) {
  vapply(
    X = filepaths,
    FUN = function(path) {
      all_lines <- readr::read_lines(path, n_max = -1L, lazy = TRUE, progress = FALSE)
      length(all_lines)
    },
    FUN.VALUE = integer(1)
  )
}


# validate_isins <- function(isins) {
#   is_luhn <- function(x) {
#     digits <- suppressWarnings(as.numeric(rev(unlist(strsplit(x, "")))))
#     odd <- seq_along(digits) %% 2 == 1
#     s1 <- sum(digits[odd])
#     s2 <- digits[!odd] * 2
#     s2 <- sum(s2 %% 10 + s2 %/% 10)
#     sum(s1, s2) %% 10 == 0
#   }
#
#   isins <- toupper(isins)
#   isins <- gsub(pattern = "[[:blank:]]", replacement = "", isins)
#   valid_struct <- grepl("^[[:upper:]]{2}[[:alnum:]]{9}[[:digit:]]$", isins)
#
#   valid_luhn <-
#     vapply(
#       X = isins,
#       FUN = function(x) {
#         x <- stringi::stri_replace_all_fixed(x, LETTERS, seq_along(LETTERS) + 9L,
#                                              vectorize_all = FALSE)
#         out <- vapply(x, is_luhn,
#                       FUN.VALUE = logical(1L), USE.NAMES = FALSE)
#         out[is.na(out)] <- FALSE
#         out
#       },
#       FUN.VALUE = logical(1),
#       USE.NAMES = FALSE
#     )
#
#   valid_struct & valid_luhn
# }


# validate_isins_list <- function(isins_list) {
#   if (class(isins_list) != "list") isins_list <- list(isins_list)
#   lapply(X = isins_list, FUN = validate_isins)
# }
#
#
# validate_iso4217c <- function(codes_list) {
#   if (!is.list(codes_list)) codes_list <- list(codes_list)
#   vapply(
#     X = codes_list,
#     FUN = function(codes) {
#       all(toupper(unique(codes)) %in% countrycode::codelist$iso4217c)
#     },
#     FUN.VALUE = logical(1)
#   )
# }


#' Try reading each delimited file with readr and report whether it succeeds
#'
#' @param filepaths character vector of file paths
#' @param encodings character vector of encodings (parallel to `filepaths`)
#' @param delimiters character vector of delimiters (parallel to `filepaths`)
#' @return logical vector: TRUE if the file could be read without error
validate_read_without_error <- function(filepaths, encodings, delimiters) {
  vapply(
    X = seq_along(filepaths),
    FUN = function(i) {
      out <- tryCatch(
        suppressWarnings(suppressMessages(
          readr::read_delim(
            file = filepaths[[i]],
            delim = delimiters[[i]],
            locale = readr::locale(encoding = encodings[[i]]),
            progress = FALSE,
            show_col_types = FALSE
          )
        )),
        # FIX: the original tryCatch() had no error handler, so read errors
        # propagated out of this function instead of being captured, and it
        # could never actually return FALSE
        error = function(e) e
      )
      !inherits(out, "error")
    },
    FUN.VALUE = logical(1)
  )
}
diff --git a/README.md b/README.md new file mode 100644 index 0000000..0244a6a --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@

# Meta Report Data Creator

## Instructions

In each of the R files (`phase-1`, `phase-2`, and `phase-3`), update the lines defining:

* `data_path <- `
* `output_dir <- `

Then run
```bash
Rscript phase-1_combine-portfolios.R
```

If you encounter errors, you will need to modify the portfolio `csv` files to correct the errors.


Then run
```bash
Rscript phase-2_run-pacta.R
```

which takes a long time.
```bash
Rscript phase-3_combine-results.R
```

combines the results into peer files (meta + org in one set, user_id in the other)
diff --git a/phase-1_combine-portfolios.R b/phase-1_combine-portfolios.R new file mode 100755 index 0000000..1373aba --- /dev/null +++ b/phase-1_combine-portfolios.R @@ -0,0 +1,430 @@

# load required packages -------------------------------------------------------

# FIX: these were loaded with require(), which returns FALSE instead of
# erroring when a package is missing, letting the script continue and fail
# obscurely later; library() fails fast at startup instead
suppressPackageStartupMessages({
  library(R.utils, quietly = TRUE, warn.conflicts = FALSE)
  library(tibble, quietly = TRUE)
  library(fs, quietly = TRUE)
  library(cli, quietly = TRUE)
  library(stringi, quietly = TRUE)
  library(wand, quietly = TRUE)
  library(stringr, quietly = TRUE)
  library(pacta.portfolio.analysis, quietly = TRUE) # must install with # devtools::install_github("RMI-PACTA/pacta.portfolio.analysis")

  library(dplyr, warn.conflicts = FALSE)
  library(devtools, quietly = TRUE, warn.conflicts = FALSE)
  library(purrr)
  library(stringr)
  library(fs)
  library(r2dii.utils) # must install with # devtools::install_github("2DegreesInvesting/r2dii.utils")
  library(readr)
  library(yaml)
  library(config)
  library(here)
})

# the config file path is expected as the single trailing command-line
# argument, e.g. `Rscript phase-1_combine-portfolios.R config.yml`
cfg <- config::get(file = commandArgs(trailingOnly = TRUE))

source(here::here("R", "get_csv_specs.R"))


# manually set certain values and paths ----------------------------------------

# WARNING!!! These filepaths are easy to mess up. You're much better off
# copy-pasting them from your filesystem rather than trying to manually edit
# bits of it. Seriously. Trust me.

output_dir <- cfg$output_dir # this will likely not work on Windows, so change it!

combined_portfolio_results_output_dir <- file.path(output_dir, "combined", "portfolio_level")
combined_user_results_output_dir <- file.path(output_dir, "combined", "user_level")
combined_orgtype_results_output_dir <- file.path(output_dir, "combined", "orgtype_level")

data_path <- cfg$data_path
portfolios_path <- file.path(data_path, "portfolios")
portfolios_meta_csv <- file.path(data_path, "portfolios.csv")
users_meta_csv <- file.path(data_path, "users.csv")

project_code <- cfg$project_code
default_language <- cfg$default_language

project_prefix <- cfg$project_prefix
holdings_date <- cfg$holdings_date

bogus_csvs_to_be_ignored <- cfg$bogus_csvs_to_be_ignored # if none, this should be c()
if (is.null(bogus_csvs_to_be_ignored)) {bogus_csvs_to_be_ignored <- c()}
users_to_be_ignored <- cfg$users_to_be_ignored # if none, this should be c()
if (is.null(users_to_be_ignored)) {users_to_be_ignored <- c()}


# check paths and directories --------------------------------------------------

dir.create(output_dir, showWarnings = FALSE)
dir.create(combined_portfolio_results_output_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(file.path(combined_portfolio_results_output_dir, "30_Processed_inputs"), showWarnings = FALSE)
dir.create(file.path(combined_portfolio_results_output_dir, "40_Results"), showWarnings = FALSE)
dir.create(combined_user_results_output_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(file.path(combined_user_results_output_dir, "30_Processed_inputs"), showWarnings = FALSE)
dir.create(file.path(combined_user_results_output_dir, "40_Results"), showWarnings = FALSE)
dir.create(combined_orgtype_results_output_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(file.path(combined_orgtype_results_output_dir, "30_Processed_inputs"), showWarnings = FALSE)
dir.create(file.path(combined_orgtype_results_output_dir, "40_Results"), showWarnings = FALSE)

stopifnot(dir.exists(output_dir))
+stopifnot(dir.exists(portfolios_path)) +stopifnot(file.exists(portfolios_meta_csv)) +stopifnot(file.exists(users_meta_csv)) +stopifnot(dir.exists(combined_portfolio_results_output_dir)) +stopifnot(dir.exists(file.path(combined_portfolio_results_output_dir, "30_Processed_inputs"))) +stopifnot(dir.exists(file.path(combined_portfolio_results_output_dir, "40_Results"))) +stopifnot(dir.exists(combined_user_results_output_dir)) +stopifnot(dir.exists(file.path(combined_user_results_output_dir, "30_Processed_inputs"))) +stopifnot(dir.exists(file.path(combined_user_results_output_dir, "40_Results"))) +stopifnot(dir.exists(combined_orgtype_results_output_dir)) +stopifnot(dir.exists(file.path(combined_orgtype_results_output_dir, "30_Processed_inputs"))) +stopifnot(dir.exists(file.path(combined_orgtype_results_output_dir, "40_Results"))) + + +# set needed values + +pacta_directories <- c("00_Log_Files", "10_Parameter_File", "20_Raw_Inputs", "30_Processed_Inputs", "40_Results", "50_Outputs") + + +# prepare a list of all the CSVs to import ------------------------------------- + +portfolio_csvs <- list.files(portfolios_path, pattern = "[.]csv$", full.names = TRUE) + + +# read in meta data CSVs ------------------------------------------------------- + +portfolios_meta <- read_csv(portfolios_meta_csv, show_col_types = FALSE) +users_meta <- read_csv(users_meta_csv, show_col_types = FALSE) + +if ("organization_type.id" %in% names(users_meta)) { + users_meta <- select(users_meta, id, organization_type = organization_type.id) +} else { + users_meta <- select(users_meta, id, organization_type) +} + + +# remove child portfolios ------------------------------------------------- + +child_ids <- portfolios_meta$id[!is.na(portfolios_meta$parent)] +portfolio_csvs <- portfolio_csvs[!tools::file_path_sans_ext(basename(portfolio_csvs)) %in% child_ids] + + +# remove unsubmitted CSVs ------------------------------------------------------ + +unsubmitted_ids <- 
portfolios_meta$id[portfolios_meta$submitted == 0] +portfolio_csvs <- portfolio_csvs[!tools::file_path_sans_ext(basename(portfolio_csvs)) %in% unsubmitted_ids] + + +# remove bogus CSVs ------------------------------------------------------------ + +portfolio_csvs <- portfolio_csvs[! tools::file_path_sans_ext(basename(portfolio_csvs)) %in% bogus_csvs_to_be_ignored] + + +# read in all the specs and remove unusable CSVs ------------------------------- + +specs <- get_csv_specs(portfolio_csvs) +saveRDS(specs, file.path(output_dir, paste0(project_prefix, "_csv_specs.rds"))) +portfolio_csvs <- specs$filepath + + +# read in all the CSVs --------------------------------------------------------- + +data <- read_portfolio_csv(portfolio_csvs) + + +# add meta data to full data and save it --------------------------------------- + +data <- + data %>% + mutate(port_id = suppressWarnings(as.numeric(tools::file_path_sans_ext(basename(filepath))))) %>% + left_join(portfolios_meta[, c("id", "user_id")], by = c(port_id = "id")) %>% + left_join(users_meta[, c("id", "organization_type")], by = c(user_id = "id")) + +data <- + data %>% + filter(!is.na(port_id)) %>% + filter(!is.na(user_id)) + +# remove users from analysis +data <- data %>% + filter(!(user_id %in% users_to_be_ignored)) + +# `write_csv()` sometimes fails on Windows and is not necessary, so commented out until solved +# write_csv(data, file = file.path(output_dir, paste0(project_prefix, "_full.csv"))) +saveRDS(data, file.path(output_dir, paste0(project_prefix, "_full.rds"))) + + +# prepare meta PACTA project --------------------------------------------------- + +meta_output_dir <- file.path(output_dir, "meta") +dir.create(meta_output_dir, showWarnings = FALSE) + +dir_create(file.path(meta_output_dir, pacta_directories)) + +data %>% + mutate(portfolio_name = "Meta Portfolio") %>% + mutate(investor_name = "Meta Investor") %>% + select(investor_name, portfolio_name, isin, market_value, currency) %>% + group_by_all() %>% 
ungroup(market_value) %>% + summarise(market_value = sum(market_value, na.rm = TRUE), .groups = "drop") %>% + write_csv(file = file.path(meta_output_dir, "20_Raw_Inputs", paste0(project_prefix, "_meta.csv"))) + +config_list <- + list( + default = list( + parameters = list( + portfolio_name = "Meta Portfolio", + investor_name = "Meta Investor", + peer_group = paste0(project_prefix, "_meta"), + language = default_language, + project_code = project_code, + holdings_date = holdings_date + ) + ) + ) +write_yaml(config_list, file = file.path(meta_output_dir, "10_Parameter_File", paste0(project_prefix, "_meta", "_PortfolioParameters.yml"))) + + +# slices for per user_id ------------------------------------------------------- + +users_output_dir <- file.path(output_dir, "user_id") +dir.create(users_output_dir, showWarnings = FALSE) + +all_user_ids <- unique(data$user_id) + +for (user_id in all_user_ids) { + user_data <- data %>% dplyr::filter(user_id == .env$user_id) + + investor_name <- encodeString(as.character(unique(user_data$investor_name))) + if (length(investor_name) > 1) { + investor_name <- investor_name[[1]] + user_data <- user_data %>% mutate(investor_name = .env$investor_name) + } + + user_data <- user_data %>% mutate(portfolio_name = .env$investor_name) + + peer_group <- unique(user_data$organization_type) + if (length(peer_group) > 1) { peer_group <- peer_group[[1]] } + + config_list <- + list( + default = list( + parameters = list( + portfolio_name = as.character(investor_name), + investor_name = as.character(investor_name), + peer_group = peer_group, + language = default_language, + project_code = project_code, + holdings_date = holdings_date + ) + ) + ) + + user_id_output_dir <- file.path(users_output_dir, paste0(project_prefix, "_user_", user_id)) + dir_create(file.path(user_id_output_dir, pacta_directories)) + + write_yaml(config_list, file = file.path(user_id_output_dir, "10_Parameter_File", paste0(project_prefix, "_user_", user_id, 
"_PortfolioParameters.yml"))) + + user_data %>% + select(investor_name, portfolio_name, isin, market_value, currency) %>% + group_by_all() %>% ungroup(market_value) %>% + summarise(market_value = sum(market_value, na.rm = TRUE), .groups = "drop") %>% + write_csv(file.path(user_id_output_dir, "20_Raw_Inputs", paste0(project_prefix, "_user_", user_id, ".csv"))) +} + + +# slices for per organization_type --------------------------------------------- + +orgs_output_dir <- file.path(output_dir, "organization_type") +dir.create(orgs_output_dir, showWarnings = FALSE) + +all_org_types <- unique(data$organization_type) + +for (org_type in all_org_types) { + org_data <- + data %>% + dplyr::filter(organization_type == .env$org_type) %>% + mutate(investor_name = .env$org_type) %>% + mutate(portfolio_name = .env$org_type) + + config_list <- + list( + default = list( + parameters = list( + portfolio_name = org_type, + investor_name = org_type, + peer_group = org_type, + language = default_language, + project_code = project_code, + holdings_date = holdings_date + ) + ) + ) + + org_type_output_dir <- file.path(orgs_output_dir, paste0(project_prefix, "_org_", org_type)) + dir_create(file.path(org_type_output_dir, pacta_directories)) + + write_yaml(config_list, file = file.path(org_type_output_dir, "10_Parameter_File", paste0(project_prefix, "_org_", org_type, "_PortfolioParameters.yml"))) + + org_data %>% + select(investor_name, portfolio_name, isin, market_value, currency) %>% + group_by_all() %>% ungroup(market_value) %>% + summarise(market_value = sum(market_value, na.rm = TRUE), .groups = "drop") %>% + write_csv(file.path(org_type_output_dir, "20_Raw_Inputs", paste0(project_prefix, "_org_", org_type, ".csv"))) +} + +# slices for per port_id ------------------------------------------------------- + +ports_output_dir <- file.path(output_dir, "port_id") +dir.create(ports_output_dir, showWarnings = FALSE) + +all_port_ids <- unique(data$port_id) + +for (port_id in all_port_ids) { + 
port_data <- + data %>% + dplyr::filter(port_id == .env$port_id) %>% + mutate(portfolio_name = as.character(.env$port_id)) + + portfolio_name <- encodeString(as.character(unique(port_data$portfolio_name))) + if (length(portfolio_name) > 1) { portfolio_name <- port_id } + + investor_name <- encodeString(as.character(unique(port_data$investor_name))) + if (length(investor_name) > 1) { investor_name <- investor_name[[1]] } + + peer_group <- unique(port_data$organization_type) + if (length(peer_group) > 1) { peer_group <- peer_group[[1]] } + + config_list <- + list( + default = list( + parameters = list( + portfolio_name = as.character(portfolio_name), + investor_name = as.character(investor_name), + peer_group = peer_group, + language = default_language, + project_code = project_code, + holdings_date = holdings_date + ) + ) + ) + + port_id_output_dir <- file.path(ports_output_dir, paste0(project_prefix, "_port_", port_id)) + dir_create(file.path(port_id_output_dir, pacta_directories)) + + write_yaml(config_list, file = file.path(port_id_output_dir, "10_Parameter_File", paste0(project_prefix, "_port_", port_id, "_PortfolioParameters.yml"))) + + port_data %>% + select(investor_name, portfolio_name, isin, market_value, currency) %>% + mutate(investor_name = .env$investor_name) %>% + mutate(portfolio_name = .env$portfolio_name) %>% + group_by_all() %>% ungroup(market_value) %>% + summarise(market_value = sum(market_value, na.rm = TRUE), .groups = "drop") %>% + write_csv(file.path(port_id_output_dir, "20_Raw_Inputs", paste0(project_prefix, "_port_", port_id, ".csv"))) +} + + +# slices for per port_id ------------------------------------------------------- + +ports_output_dir <- file.path(output_dir, "port_id") +dir.create(ports_output_dir, showWarnings = FALSE) + +all_port_ids <- unique(data$port_id) + +for (port_id in all_port_ids) { + port_data <- + data %>% + dplyr::filter(port_id == .env$port_id) %>% + mutate(portfolio_name = as.character(.env$port_id)) + + 
portfolio_name <- encodeString(as.character(unique(port_data$portfolio_name))) + if (length(portfolio_name) > 1) { portfolio_name <- port_id } + + investor_name <- encodeString(as.character(unique(port_data$investor_name))) + if (length(investor_name) > 1) { investor_name <- investor_name[[1]] } + + peer_group <- unique(port_data$organization_type) + if (length(peer_group) > 1) { peer_group <- peer_group[[1]] } + + config_list <- + list( + default = list( + parameters = list( + portfolio_name = as.character(portfolio_name), + investor_name = as.character(investor_name), + peer_group = peer_group, + language = default_language, + project_code = project_code, + holdings_date = holdings_date + ) + ) + ) + + port_id_output_dir <- file.path(ports_output_dir, paste0(project_prefix, "_port_", port_id)) + dir_create(file.path(port_id_output_dir, pacta_directories)) + + write_yaml(config_list, file = file.path(port_id_output_dir, "10_Parameter_File", paste0(project_prefix, "_port_", port_id, "_PortfolioParameters.yml"))) + + port_data %>% + select(investor_name, portfolio_name, isin, market_value, currency) %>% + mutate(investor_name = .env$investor_name) %>% + mutate(portfolio_name = .env$portfolio_name) %>% + group_by_all() %>% ungroup(market_value) %>% + summarise(market_value = sum(market_value, na.rm = TRUE), .groups = "drop") %>% + write_csv(file.path(port_id_output_dir, "20_Raw_Inputs", paste0(project_prefix, "_port_", port_id, ".csv"))) +} + + +# slices for per port_id ------------------------------------------------------- + +ports_output_dir <- file.path(output_dir, "port_id") +dir.create(ports_output_dir, showWarnings = FALSE) + +all_port_ids <- unique(data$port_id) + +for (port_id in all_port_ids) { + port_data <- + data %>% + dplyr::filter(port_id == .env$port_id) %>% + mutate(portfolio_name = as.character(.env$port_id)) + + portfolio_name <- encodeString(as.character(unique(port_data$portfolio_name))) + if (length(portfolio_name) > 1) { portfolio_name <- 
port_id } + + investor_name <- encodeString(as.character(unique(port_data$investor_name))) + if (length(investor_name) > 1) { investor_name <- investor_name[[1]] } + + peer_group <- unique(port_data$organization_type) + if (length(peer_group) > 1) { peer_group <- peer_group[[1]] } + + config_list <- + list( + default = list( + parameters = list( + portfolio_name = as.character(portfolio_name), + investor_name = as.character(investor_name), + peer_group = peer_group, + language = default_language, + project_code = project_code, + holdings_date = holdings_date + ) + ) + ) + + port_id_output_dir <- file.path(ports_output_dir, paste0(project_prefix, "_port_", port_id)) + dir_create(file.path(port_id_output_dir, pacta_directories)) + + write_yaml(config_list, file = file.path(port_id_output_dir, "10_Parameter_File", paste0(project_prefix, "_port_", port_id, "_PortfolioParameters.yml"))) + + port_data %>% + select(investor_name, portfolio_name, isin, market_value, currency) %>% + mutate(investor_name = .env$investor_name) %>% + mutate(portfolio_name = .env$portfolio_name) %>% + group_by_all() %>% ungroup(market_value) %>% + summarise(market_value = sum(market_value, na.rm = TRUE), .groups = "drop") %>% + write_csv(file.path(port_id_output_dir, "20_Raw_Inputs", paste0(project_prefix, "_port_", port_id, ".csv"))) +} + + diff --git a/phase-2_run-pacta-docker.R b/phase-2_run-pacta-docker.R new file mode 100644 index 0000000..3e61af3 --- /dev/null +++ b/phase-2_run-pacta-docker.R @@ -0,0 +1,79 @@ +library("dplyr") +library("tibble") +library("purrr") +library("tidyr") +library("fs") +library("here") +library("config") + + +cfg <- config::get(file = commandArgs(trailingOnly = TRUE)) + +output_dir <- cfg$output_dir # this will likely not work on Windows, so change it! 
project_prefix <- cfg$project_prefix
group_types <- cfg$group_types

pacta_directories <- c("00_Log_Files", "10_Parameter_File", "20_Raw_Inputs", "30_Processed_Inputs", "40_Results", "50_Outputs")

# default grouping levels when the config does not provide any
if (is.null(group_types)) {
  group_types <- c(
    "meta",
    "organization_type",
    "user_id" # ,
    # "port_id"
  )
}

all_paths <- tibble(type = setdiff(group_types, "meta")) %>% # so the meta 10/20/30 dirs don't get expanded
  mutate(path = purrr::map(type, ~ list.dirs(file.path(output_dir, .x), recursive = FALSE))) %>%
  tidyr::unnest(path) %>%
  bind_rows(tibble(type = "meta", path = file.path(output_dir, "meta"))) %>%
  mutate(portfolio_name_ref_all = case_when(
    type == "meta" ~ paste0(project_prefix, "_meta"),
    TRUE ~ basename(path)
  )) %>%
  mutate(type = factor(type, ordered = TRUE, levels = group_types)) %>%
  arrange(type, portfolio_name_ref_all) %>%
  filter(!portfolio_name_ref_all %in% c(
    ""
  ))

script_path <- here("transitionmonitor_docker", "run-like-constructiva-flags.sh")
working_dir <- here("working_dir")
user_dir <- here("user_results")
dir_create(file.path(user_dir, "4"))
stopifnot(file.exists(script_path))
stopifnot(dir.exists(user_dir))

# run PACTA (via the docker wrapper script) once per project directory,
# skipping any directory that already has results
for (i in seq_len(nrow(all_paths))) {
  this_row <- all_paths[i, ]
  message(paste(Sys.time(), this_row$type, this_row$portfolio_name_ref_all, "--", i, "/", nrow(all_paths)))
  message(paste(" ", this_row$path))
  these_dirs <- file.path(this_row$path, pacta_directories)
  stopifnot(all(dir.exists(these_dirs)))
  has_results <- (length(list.files(file.path(this_row$path, "40_Results"))) > 0)
  if (has_results) {
    message(" Results already exist, skipping")
  } else {
    message(" Running PACTA")
    # the wrapper mounts working_dir into the container, so start it fresh
    if (dir.exists(working_dir)) {
      dir_delete(working_dir)
    }
    dir_create(working_dir)
    portfolio_name_ref_all <- this_row$portfolio_name_ref_all
    dir_copy(this_row$path, file.path(working_dir, portfolio_name_ref_all), overwrite = TRUE)
    tic <- Sys.time()
    system2(
      command = script_path,
      args = c(
        paste0("-p ", "\"", this_row$portfolio_name_ref_all, "\""),
        paste("-w", working_dir),
        paste("-y", user_dir),
        "-r /bound/bin/run-r-scripts-results-only"
      )
    )
    message(paste(" ", format.difftime(Sys.time() - tic)))
    # move the enriched working copy back over the original project directory
    dir_delete(this_row$path)
    dir_copy(file.path(working_dir, portfolio_name_ref_all), this_row$path, overwrite = TRUE)
  }
}
diff --git a/phase-3_combine-results.R b/phase-3_combine-results.R
new file mode 100644
index 0000000..a1e6d55
--- /dev/null
+++ b/phase-3_combine-results.R
@@ -0,0 +1,124 @@
library("tidyverse")
library("config")

cfg <- config::get(file = commandArgs(trailingOnly = TRUE))

output_dir <- cfg$output_dir
combined_portfolio_results_output_dir <- file.path(output_dir, "combined", "portfolio_level")
combined_user_results_output_dir <- file.path(output_dir, "combined", "user_level")
combined_orgtype_results_output_dir <- file.path(output_dir, "combined", "orgtype_level")

data_path <- cfg$data_path
portfolios_path <- file.path(data_path, "portfolios")
portfolios_meta_csv <- file.path(data_path, "portfolios.csv")
users_meta_csv <- file.path(data_path, "users.csv")

project_code <- cfg$project_code
default_language <- cfg$default_language

project_prefix <- cfg$project_prefix
holdings_date <- cfg$holdings_date

# user id -> org type lookup; everything as character so joins against
# str_extract() output work
users <- read_csv(users_meta_csv) %>%
  select(
    user_id = "id",
    type_id = "organization_type.id",
    type = "organization_type.translationKey"
  ) %>%
  mutate_all(as.character)

data_filenames <-
  c(
    "Bonds_results_portfolio.rds",
    "Bonds_results_company.rds",
    "Bonds_results_map.rds",
    "Equity_results_portfolio.rds",
    # "Equity_results_company.rds",
    "Equity_results_map.rds"
  )

input_filenames <- c(
  "overview_portfolio.rds",
  "total_portfolio.rds",
  "emissions.rds",
  "audit_file.rds"
)

all_filenames <- tibble(filename = data_filenames, num_dir = "40_Results") %>%
  bind_rows(tibble(filename = input_filenames, num_dir = "30_Processed_inputs"))

# --combine org-level results--

org_paths <- tibble(path = list.files(file.path(output_dir, "organization_type"), recursive = TRUE, full.names = TRUE)) %>%
  bind_rows(tibble(path = list.files(file.path(output_dir, "meta"), recursive = TRUE, full.names = TRUE))) %>%
  mutate(
    filename = basename(path),
    filedir = dirname(path)
  ) %>%
  filter(filename %in% all_filenames$filename) %>%
  mutate(org = str_extract(string = as.character(filedir), pattern = "(?<=_org_)\\d+|meta")) %>% # lookbehind group. https://stackoverflow.com/a/46788230
  left_join(distinct(select(users, type_id, type)), by = c("org" = "type_id")) %>%
  print()

for (j in seq_len(nrow(all_filenames))) {
  this_filetype <- all_filenames[j, ]
  file_paths <- org_paths %>% filter(filename == this_filetype$filename)
  all_results <- NULL
  for (i in seq_len(nrow(file_paths))) {
    this_file <- file_paths[i, ]
    message(paste("processing", this_file$filename, "for org", this_file$org, "--", i, "/", nrow(file_paths), "--", j, "/", nrow(all_filenames)))
    content <- readRDS(this_file$path) %>%
      mutate(
        investor_name = this_file$type,
        portfolio_name = this_file$type
      )
    all_results <- bind_rows(all_results, content)
  }
  saveRDS(all_results, file.path(combined_orgtype_results_output_dir, this_filetype$num_dir, this_filetype$filename))
}

# --combine user-level results--

user_paths <- tibble(path = list.files(file.path(output_dir, "user_id"), recursive = TRUE, full.names = TRUE)) %>%
  mutate(
    filename = basename(path),
    filedir = dirname(path)
  ) %>%
  filter(filename %in% all_filenames$filename) %>%
  mutate(user = str_extract(string = as.character(filedir), pattern = "(?<=_user_)\\d+")) %>% # lookbehind group. https://stackoverflow.com/a/46788230
  left_join(distinct(select(users, user_id, type)), by = c("user" = "user_id")) %>%
  print()

for (j in seq_len(nrow(all_filenames))) {
  this_filetype <- all_filenames[j, ]
  file_paths <- user_paths %>% filter(filename == this_filetype$filename)
  all_results <- NULL
  for (i in seq_len(nrow(file_paths))) {
    this_file <- file_paths[i, ]
    message(paste("processing", this_file$filename, "for user", this_file$user, "--", i, "/", nrow(file_paths), "--", j, "/", nrow(all_filenames)))
    message(paste(" ", nrow(all_results)))
    content <- readRDS(this_file$path) %>%
      mutate(
        investor_name = this_file$type,
        portfolio_name = this_file$user
      )
    all_results <- bind_rows(all_results, content)
  }
  saveRDS(all_results, file.path(combined_user_results_output_dir, this_filetype$num_dir, this_filetype$filename))
}

file.copy(
  from = file.path(combined_orgtype_results_output_dir, "40_Results", "Equity_results_portfolio.rds"),
  to = file.path(output_dir, paste0(project_code, "_peers_equity_results_portfolio.rds"))
)
file.copy(
  from = file.path(combined_orgtype_results_output_dir, "40_Results", "Bonds_results_portfolio.rds"),
  to = file.path(output_dir, paste0(project_code, "_peers_bonds_results_portfolio.rds"))
)
file.copy(
  from = file.path(combined_user_results_output_dir, "40_Results", "Equity_results_portfolio.rds"),
  to = file.path(output_dir, paste0(project_code, "_peers_equity_results_portfolio_ind.rds"))
)
file.copy(
  from = file.path(combined_user_results_output_dir, "40_Results", "Bonds_results_portfolio.rds"),
  to = file.path(output_dir, paste0(project_code, "_peers_bonds_results_portfolio_ind.rds"))
)
diff --git a/template.yml b/template.yml
new file mode 100644
index 0000000..0977402
--- /dev/null
+++ b/template.yml
@@ -0,0 +1,16 @@
---
default:
  output_dir: ""
  data_path: ""
  project_code: "PA2022XX"
  default_language: "EN"
  project_prefix: "PA2022XX"
  holdings_date: "2021Q4"
bogus_csvs_to_be_ignored: + # yaml list + # - 1 + # - 2 + users_to_be_ignored: + # yaml list + # - 1 + # - 2 diff --git a/transitionmonitor_docker/run-like-constructiva-flags.sh b/transitionmonitor_docker/run-like-constructiva-flags.sh new file mode 100755 index 0000000..edee456 --- /dev/null +++ b/transitionmonitor_docker/run-like-constructiva-flags.sh @@ -0,0 +1,141 @@ +#! /bin/bash + +usage() { + echo "Usage: $0 -p " 1>&2 + echo "Optional flags:" 1>&2 + # t for tag + echo "[-t ] (default latest)" 1>&2 + echo "[-u ] (default 4)" 1>&2 + echo "[-m ] (default rmi_pacta)" 1>&2 + # a for pacta.portfolio.analysis + echo "[-a ] (default docker internal)" 1>&2 + # d for pacta-data + echo "[-d ] (default docker internal)" 1>&2 + # c for pacta.interactive.report + echo "[-c ] (default docker internal)" 1>&2 + # r for run + echo "[-r ] (default /bound/bin/run-r-scripts "portfolio name string")" 1>&2 + # x for architecture? + echo "[-x ] (default linux/x86_64)" 1>&2 + # w for working_dirs + echo "[-w ] (default ./working_dirs)" 1>&2 + # y for user_results + echo "[-y ] (default ./user_results)" 1>&2 + # v for verbose + echo "[-v] (verbose mode)" 1>&2 + echo "[-i] run container in interactive, tty mode (docker run -it)" 1>&2 + exit 1; +} + +while getopts p:t:u:a:d:c:d:r:x:w:y:m:vi flag +do + case "${flag}" in + u) userId=${OPTARG};; + p) portfolioIdentifier=${OPTARG};; + m) docker_image=${OPTARG};; + t) tag=${OPTARG};; + a) pa_repo=${OPTARG};; + d) data_repo=${OPTARG};; + c) cir_repo=${OPTARG};; + r) docker_command=${OPTARG};; + x) target_platform=${OPTARG};; + w) working_dirs=${OPTARG};; + y) user_results=${OPTARG};; + v) verbose=1;; + i) interactive=1;; + *) usage;; + esac +done + +if [ -z "${portfolioIdentifier}" ]; then + usage +fi + +if [ -z "${tag}" ]; then + tag="latest" +fi + +if [ -z "${userId}" ]; then + userId="4" +fi + +if [ -z "${target_platform}" ]; then + target_platform="linux/x86_64" +fi + +if [ -z "${working_dirs}" ]; then + 
working_dirs="$(pwd)"/working_dirs +fi + +if [ -z "${user_results}" ]; then + user_results="$(pwd)/user_results" +fi + +if [ -z "${docker_image}" ]; then + docker_image="rmi_pacta" +fi + +if [ -z "${docker_command}" ]; then + docker_command="/bound/bin/run-r-scripts" +fi + +yellow () { + printf "\033[33m$1\033[0m\n" +} + +resultsFolder="$working_dirs"/"$portfolioIdentifier" +userFolder="$user_results"/"$userId" + +args=( + "--rm" + --platform "$target_platform" + "--pull=never" + --network none + --user 1000:1000 + # "--memory=16g" +) + +if [ "${target_platform}" != "linux/arm64" ]; then + args+=("--memory-swappiness=0") +fi + +if [ -n "${interactive}" ]; then + args+=("-it") +fi + +if [ -n "${pa_repo}" ]; then + args+=(--mount "type=bind,source=${pa_repo},target=/bound") +fi + +if [ -n "${data_repo}" ]; then + args+=(--mount "type=bind,source=${data_repo},target=/pacta-data") +fi + +if [ -n "${cir_repo}" ]; then + args+=(--mount "type=bind,source=${cir_repo},target=/pacta.interactive.report") +fi + +args+=( + --mount "type=bind,source=${resultsFolder},target=/bound/working_dir" + --mount "type=bind,readonly,source=${userFolder},target=/user_results" +) +args+=("$docker_image:$tag") +if [ "$docker_command" = "bash" ]; then + args+=("$docker_command") +else + args+=("$docker_command" "$portfolioIdentifier") +fi + +if [ -n "${verbose}" ]; then + yellow "docker run \\ " + for arg in "${args[@]}"; do + yellow " $arg \\ " + done + yellow "" +fi + +echo Running Docker Container + +docker run "${args[@]}" + +exit 0