diff --git a/DESCRIPTION b/DESCRIPTION
index 2da1472..72cfb82 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: LinkTools
 Type: Package
 Title: LinkTools
-Version: 0.0.1.9003
-Date: 2023-02-07
+Version: 0.0.1.9004
+Date: 2023-04-19
 Author: Christoph Leonhardt
 Maintainer: Christoph Leonhardt
 Description: This package facilitates the linkage of datasets via shared unique identifiers. Four steps are integrated into this package: a) the preparation of datasets which should be linked, i.e. the transformation into a comparable format and the assignment of shared unique identifiers, b) the merge of datasets based on these identifiers, c) the encoding or enrichment of the data with three output formats (data.table, XML or CWB). In addition, d), the package includes a wrapper for the Named Entity Linking of textual data based on DBPedia Spotlight.
@@ -22,12 +22,16 @@ Imports:
     rhandsontable,
     stringr
 Suggests:
+    btmp,
     knitr,
     devtools,
-    DT
+    DT,
+    testthat (>= 3.0.0),
+    withr
 VignetteBuilder: knitr
 LazyData: yes
 License: GPL (>= 3)
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.2
+RoxygenNote: 7.2.3
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index 01c4e47..837dc04 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -10,6 +10,7 @@ importFrom(data.table,":=")
 importFrom(data.table,data.table)
 importFrom(data.table,is.data.table)
 importFrom(data.table,rleid)
+importFrom(data.table,rleidv)
 importFrom(data.table,setnames)
 importFrom(data.table,setorder)
 importFrom(fuzzyjoin,fuzzy_join)
diff --git a/NEWS.md b/NEWS.md
index bbeb867..c6832b9 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,17 @@
+[2023-04-19]
+* addressed a substantial issue in `external_attribute_to_region_matrix()` that could obscure speakers which were not matched, making them unavailable for both fuzzy matching and manual inspection (issue #14)
+* introduced tests
+* modified the vignette; it now uses GERMAPARLMINI as sample data and the `btmp` package for linking
+* removed data and data-raw, which contained the external data now provided by the `btmp` package
+* made the addition more robust for subcorpora
+
+[2023-04-12]
+* started rework to address more diverging combinations of datasets and textual data
+* added the `additional_attributes` argument to `create_attribute_region_datatable()` to make manual inspection more meaningful
+* made `check_and_add_missing_values()` more flexible by passing `check_for_groups` as a list and adding the `negate` argument
+* added the capability to use more than one fuzzy-matched variable in `fuzzy_join_missing_values()`
+* made more explicit use of fuzzyjoin::stringdist_join()
+
 [2023-02-07]
 # v0.0.1.9003
 * Added `Depends: R (>= 3.5.0)` to DESCRIPTION to avoid warning when build the package #2.
diff --git a/R/LTDataset.R b/R/LTDataset.R
index a72e6b1..88d9495 100644
--- a/R/LTDataset.R
+++ b/R/LTDataset.R
@@ -42,13 +42,13 @@
 #'   to add missing values interactively.}
 #'   \item{`create_attribute_region_datatable()`}{returns a data.table of matched
 #'   attributes. 
Can be used for further manual checks.}
-#'   \item{`encode_new_s_attribute(add_temporarily)`}{encodes the new attribute
+#'   \item{`encode_new_s_attribute()`}{encodes the new attribute
 #'   as a structural attribute (for CWB)}
 #' }
 #' @export LTDataset
 #' @importFrom magrittr %>%
 #' @importFrom R6 R6Class
-#' @importFrom data.table data.table := is.data.table setnames setorder rleid
+#' @importFrom data.table data.table := is.data.table setnames setorder rleid rleidv
 #' @importFrom polmineR size s_attributes corpus decode
 #' @importFrom cwbtools s_attribute_encode registry_file_parse
 #' @importFrom RcppCWB cl_cpos2struc cl_struc2str
@@ -175,7 +175,7 @@ LTDataset <- R6Class(
       self$textual_data_type <- textual_data_type
       self$split_by <- split_by
       self$forced_encoding <- forced_encoding
-      
+
       # if the columns are the same, there might not be a name.
       if (is.null(names(match_by))) {
         names(match_by) <- match_by
@@ -201,7 +201,7 @@
       if (!is.character(self$textual_data)) {
         # if textual_data is no character, it might be a subcorpus, etc. Take
-        # the corpus from slot?
+        # the corpus from slot.
         if ("corpus" %in% slotNames(self$textual_data)) {
           self$cpos_left = self$textual_data@cpos[1, 1] # take first row, first column from the cpos slot of the object
@@ -216,6 +216,8 @@
       if (!is.null(self$textual_data) & self$attribute_name_in_corpus %in% polmineR::s_attributes(self$textual_data)) {
+        stop("... attribute to add already exists.")
+
         # the new var already exists as an s-attribute. What to do? modify,
         # overwrite, stop? Modification would be best, keeping entries which aren't
         # "NA"?
@@ -226,7 +228,11 @@
       if (any(!as.character(self$match_by) %in% colnames(external_resource))) stop("... not all variables in match_by are in the external dataset.")
       if (!is.null(self$split_by)) {
-        if (!self$split_by %in% polmineR::s_attributes(self$textual_data)) stop(sprintf("... the variable defined in split_by (%s) is no structural attribute in %s", self$split_by, self$textual_data))
+        if (!self$split_by %in% polmineR::s_attributes(self$textual_data)) {
+          stop(sprintf("... the variable defined in split_by (%s) is not a structural attribute in %s",
+                       self$split_by,
+                       self$textual_data))
+        }
       }
 
     } else if (is.character(self$textual_data) & self$textual_data_type == "xml") {
@@ -270,13 +276,18 @@
       cat("LTDataset Object: \n")
       cat("  Corpus: ", corpus_obj_name, "\n", sep = "")
       cat("  Attribute to be added: ", self$attribute_name_in_corpus, "\n", sep = "")
-      cat("  Via: ", paste(names(self$match_by), collapse = ", "), "\n", sep = "")
+      cat("  via: ", paste(names(self$match_by), collapse = ", "), "\n", sep = "")
       invisible(self)
     },
 
     #' @description perform the actual merge of the data.
     #' @param na_value a \code{character vector} indicating which value
     #'   attributes should have that aren't merged.
+    #' @references When formulating the data.table join functions in
+    #'   `join_textual_and_external_data()`, the following Stack Overflow links
+    #'   were useful:
+    #'   https://stackoverflow.com/questions/45043600/merging-all-column-by-reference-in-a-data-table
+    #'   and https://stackoverflow.com/questions/44433451/r-data-table-update-join
     join_textual_and_external_data = function(na_value = "NA") {
 
       if (self$textual_data_type == "cwb") {
 
@@ -288,8 +299,8 @@
         # memory efficient.
 
         if (self$verbose) message("... 
creating empty cpos data.table.") - self$text_dt <- data.table::data.table(cpos = self$cpos_left:self$cpos_right, - key = "cpos") + self$text_dt <- data.table::data.table(cpos = self$cpos_left:self$cpos_right, + key = "cpos") self$text_dt[, eval(names(self$match_by)) := "NA"] self$text_dt[, eval(self$attribute_name_in_corpus) := na_value] @@ -305,36 +316,38 @@ LTDataset <- R6Class( # it might be a subcorpus, etc. split_text_object <- self$textual_data %>% polmineR::split(s_attribute = self$split_by) - - # now we probably don't need the subcorpus anymore because we split - # before. - self$textual_data <- self$textual_data@corpus - } + } if (self$verbose) message("... for each split, retrieve s-attributes for comparison.") garbage <- lapply(1:length(split_text_object), function(i_split) { - if (self$verbose) message(sprintf("... processing split %s out of %s based on s-attribute %s.", i_split, length(split_text_object), self$split_by)) - - # decode doesn't work that well for nested corpora with attributes on - # both protocol and speaker level. Use RcppCWB directly. + if (self$verbose) message(sprintf("... processing split %s out of %s based on s-attribute %s.", + i_split, + length(split_text_object), + self$split_by)) if (self$verbose) message("...... extract s-attributes the data should be matched by.") current_split_cpos <- min(split_text_object[[i_split]]@cpos):max(split_text_object[[i_split]]@cpos) + if (is.character(self$textual_data)) { + corpus_name <- self$textual_data + } else { + corpus_name <- self$textual_data@corpus + } + s_attr_streams_for_split <- lapply(names(self$match_by), function(s_attr) { - retval <- RcppCWB::cl_cpos2struc(corpus = self$textual_data, - s_attribute = s_attr, - cpos = current_split_cpos) %>% - RcppCWB::cl_struc2str(corpus = self$textual_data, - s_attribute = s_attr, + retval <- RcppCWB::cl_cpos2struc(corpus = corpus_name, + s_attribute = s_attr, + cpos = current_split_cpos) %>% + RcppCWB::cl_struc2str(corpus = corpus_name, + s_attribute = s_attr, struc = .) # sometimes if the corpus is latin1 and the locale is UTF-8 - if (!is.null(self$forced_encoding)) retval <- iconv(retval, polmineR::encoding(self$textual_data), self$forced_encoding) + if (!is.null(self$forced_encoding)) retval <- iconv(retval, polmineR::encoding(corpus_name), self$forced_encoding) return(retval) } ) @@ -368,15 +381,10 @@ LTDataset <- R6Class( if (self$verbose) message("...... bind split stream to the nearly empty data.table.") data.table::setkey(s_attr_streams_for_split_df, cpos) - # https://stackoverflow.com/questions/45043600/merging-all-column-by-reference-in-a-data-table - cols_to_add <- setdiff(names(s_attr_streams_for_split_df), "cpos") self$text_dt[s_attr_streams_for_split_df, (cols_to_add) := mget(paste0("i.", cols_to_add))] - # https://stackoverflow.com/questions/44433451/r-data-table-update-join - - rm(s_attr_streams_for_split_df) return(NULL) } @@ -386,19 +394,22 @@ LTDataset <- R6Class( if (self$verbose) message("... decoding the entire object at once.") - self$text_dt <- polmineR::decode(self$textual_data, - s_attributes = names(self$match_by), - p_attributes = character(), - to = "data.table") - - if (self$verbose) message(sprintf('...... 
merge external attribute with decoded token stream.'))
+      self$text_dt <- polmineR::decode(self$textual_data,
+                                       s_attributes = names(self$match_by),
+                                       p_attributes = character(),
+                                       to = "data.table")
 
-      self$text_dt[self$external_resource, on = names(self$match_by),
-                   (self$attribute_name_in_corpus) := get(paste0("i.", self$attribute_name_in_corpus))]
+      # we remove struc, which would otherwise introduce far more regions than necessary.
+      if ("struc" %in% colnames(self$text_dt)) {
+        self$text_dt[, struc := NULL]
+      }
 
-      # it might make sense to use the approach from above here anyway
+      if (self$verbose) message(sprintf('...... merge external attribute with decoded token stream.'))
 
-      }
+      self$text_dt[self$external_resource, on = names(self$match_by),
+                   (self$attribute_name_in_corpus) := get(paste0("i.", self$attribute_name_in_corpus))]
+
+      }
 
       # if CWB, then make region matrix
       self$external_attribute_to_region_matrix()
@@ -413,88 +424,41 @@
     #' @description transform the matched data to a matrix for encoding.
     external_attribute_to_region_matrix = function() {
 
-      if (self$verbose) message("... reduce data.table to minimum.")
-
-      columns_to_drop <- setdiff(colnames(self$text_dt), c("cpos", self$attribute_name_in_corpus))
-      self$text_dt[, (columns_to_drop) := NULL]
-
-      # if only parts of the corpus are encoded, there might be corpus positions
-      # not covered by self$cpos_left:self$cpos_right. It is probably wise to
-      # fill the corpus positions.
-
-      # Note: If the object is a subcorpus, this only applies to the range of
-      # the subcorpus. This is why the size of the corpus, not the object, is
-      # taken now.
-
-      if (isTRUE(is.character(self$textual_data))) {
-        all_cpos_in_corpus <- 0L:(polmineR::size(self$textual_data) - 1)
-      } else {
-        # otherwise, its a subcorpus and the name of the corpus is taken from
-        # the slot.
-        all_cpos_in_corpus <- 0L:(polmineR::size(self$textual_data@corpus) - 1)
-      }
-
-      cpos_missing <- which(!all_cpos_in_corpus %in% self$cpos_left:self$cpos_right) - 1
-
-      # This is slow but setdiff seems to be even slower here
-
-      # Note: Actually one should probably allow for the additions of new IDs
-      # even if some already exist? This could be done by looking if the corpus
-      # to add is already in the corpus.
-
-      if (self$attribute_name_in_corpus %in% polmineR::s_attributes(self$textual_data)) {
-        stop("... attribute to add already exists.")
-      }
-
-      if (length(cpos_missing) > 0) {
-        # make data.table with all missing cpos and rbind
-
-        missing_dt <- data.table::data.table(
-          cpos = cpos_missing
-        )
-
-        missing_dt[, (self$attribute_name_in_corpus) := NA]
-        self$text_dt <- rbind(self$text_dt, missing_dt)
-        data.table::setorder(self$text_dt, cpos)
-      }
-
       cpos_vec <- self$text_dt[["cpos"]]
 
       if (self$verbose) message("... 
preparing breaks for the encoding process.") - breaks <- match(self$text_dt[[self$attribute_name_in_corpus]], - unique(self$text_dt[[self$attribute_name_in_corpus]])) - 1 - - breaks_rle <- rle(breaks) - + # prepare breaks every time the speaker changes + self$text_dt[, row_index := data.table::rleidv(self$text_dt, cols = setdiff(colnames(self$text_dt), c("cpos", self$attribute_name_in_corpus)))] + + breaks <- match(unique(self$text_dt$row_index), self$text_dt$row_index) + cpos_breaks <- c(cpos_vec[breaks], cpos_vec[length(cpos_vec)]) + id_factor <- cut( x = cpos_vec, - breaks = cumsum(c(0, breaks_rle$lengths)), # adding the first one here is important + breaks = cpos_breaks, include.lowest = TRUE, right = FALSE ) - rm(breaks, breaks_rle) - if (self$verbose) message("... preparing the region matrix for the encoding process.") id_cpos <- unname(split(x = cpos_vec, f = id_factor)) self$region_matrix <- do.call(rbind, lapply(id_cpos, function(cpos) c(cpos[1L], cpos[length(cpos)]))) - # now we need to get the values of the ids per row of the region matrix - self$text_dt[, row_in_region_matrix := data.table::rleid(get(self$attribute_name_in_corpus))] - - # remove cpos column and then unique self$text_dt[, cpos := NULL] self$text_dt <- unique(self$text_dt) # this might not be very memory effective self$values <- as.character(self$text_dt[[self$attribute_name_in_corpus]]) - # last sanity checks - + # Last Sanity Checks + # the last cpos of the region matrix must be equal to the size of - # the corpus (-1 for 0-indexation) + # the corpus (-1 for 0-indexation). + + cpos_start <- self$region_matrix[1, 1] + cpos_end <- self$region_matrix[nrow(self$region_matrix), 2] - stopifnot(self$region_matrix[nrow(self$region_matrix), 2] == (polmineR::size(self$textual_data) - 1)) + stopifnot((cpos_end - cpos_start) == (polmineR::size(self$textual_data) - 1)) # there must be as many values as regions stopifnot(length(self$values) == nrow(self$region_matrix)) @@ -502,34 +466,46 @@ LTDataset <- R6Class( }, #' @description encodes the new attribute as a structural attribute (for CWB). - #' @param add_temporarily whether to add the structural attribute temporarily - #' or permanently. #' @return A new `LTDataset` object. - encode_new_s_attribute = function(add_temporarily) { + encode_new_s_attribute = function() { - corpus_name <- ifelse(is.character(self$textual_data), self$textual_data, self$textual_data@corpus) - - data_dir_path <- cwbtools::registry_file_parse(corpus = corpus_name)[["home"]] - registry_dir_path <- RcppCWB::corpus_registry_dir(corpus_name) + # if this is only a subcorpus, then we might want to add all cpos which + # are not part of the subcorpus and add a value of "NA" to the missing + # cpos. + + self$add_missing_regions() + + # replace all missing/NA values with literal "NA". Not done earlier + # because in the shiny application user input is still possible. 
-      if (isTRUE(add_temporarily)) {
-        delete_param <- TRUE
+      self$values[is.na(self$values)] <- "NA"
+
+      corpus_name <- ifelse(is.character(self$textual_data), self$textual_data, self$textual_data@corpus)
+      corpus_registry <- RcppCWB::corpus_registry_dir(corpus = corpus_name)
+
+      corpus_encoding <- cwbtools::registry_file_parse(corpus = corpus_name,
+                                                       registry = corpus_registry)[["properties"]][["charset"]]
+
+      if (is.character(self$textual_data)) {
+        data_dir_path <- RcppCWB::corpus_data_dir(corpus = self$textual_data,
+                                                  registry = corpus_registry)
       } else {
-        delete_param <- FALSE
+        data_dir_path <- RcppCWB::corpus_data_dir(corpus = self$textual_data@corpus,
+                                                  registry = corpus_registry)
       }
-      
+
       if (self$verbose) message("... start encoding the s-attribute.")
-      
-      s_attribute_encode(
+
+      cwbtools::s_attribute_encode(
        values = self$values,
        data_dir = data_dir_path,
        s_attribute = tolower(self$attribute_name_in_corpus),
        corpus = corpus_name,
        region_matrix = self$region_matrix,
        method = self$encoding_method,
-        registry_dir = registry_dir_path,
-        encoding = registry_file_parse(corpus = corpus_name)[["properties"]][["charset"]],
-        delete = delete_param,
+        registry_dir = corpus_registry,
+        encoding = corpus_encoding,
+        delete = TRUE,
        verbose = FALSE
      )
 
@@ -538,17 +514,35 @@
     },
 
     #' @description returns a data.table of matched attributes. Can be used for
-    #'   further manual checks.
-    #' @param verbose \code{logical}
-    create_attribute_region_datatable = function(verbose = FALSE) {
-
-      if (verbose) message("... get attributes which were used for matching.")
+    #'   further manual checks.
+    #' @param additional_attributes a `character vector` of additional
+    #'   structural attributes which should be considered when evaluating the
+    #'   results of the linkage.
+    #' @param verbose \code{logical}
+    #' @details `additional_attributes` might be useful when information of a
+    #'   dataset is added which only covers a part of the corpus, such as a
+    #'   specific period of time or specific groups of speakers.
+    create_attribute_region_datatable = function(verbose = FALSE, additional_attributes = NULL) {
+
+      if (!is.null(additional_attributes)) {
+        stopifnot(additional_attributes %in% polmineR::s_attributes(self$textual_data))
+      }
+
+      attributes_to_check <- c(names(self$match_by), additional_attributes)
 
-      attrs_by_region <- lapply(names(self$match_by), function(attr_to_check) {
-        retval <- RcppCWB::cl_cpos2struc(self$textual_data, s_attribute = attr_to_check, cpos = self$region_matrix[, 1]) %>%
-          RcppCWB::cl_struc2str(corpus = self$textual_data, s_attribute = attr_to_check, struc = .)
+      if (is.character(self$textual_data)) {
+        corpus_name <- self$textual_data
+        cwb_registry <- RcppCWB::corpus_registry_dir(self$textual_data)
+      } else {
+        corpus_name <- self$textual_data@corpus
+        cwb_registry <- RcppCWB::corpus_registry_dir(self$textual_data@corpus)
+      }
+
+      attrs_by_region <- lapply(attributes_to_check, function(attr_to_check) {
+        retval <- RcppCWB::cl_cpos2struc(corpus_name, s_attribute = attr_to_check, cpos = self$region_matrix[, 1], registry = cwb_registry) %>%
+          RcppCWB::cl_struc2str(corpus = corpus_name, s_attribute = attr_to_check, struc = ., registry = cwb_registry)
 
-        if (!is.null(self$forced_encoding)) retval <- iconv(retval, polmineR::encoding(self$textual_data), self$forced_encoding)
+        if (!is.null(self$forced_encoding)) retval <- iconv(retval, polmineR::encoding(corpus_name), self$forced_encoding)
 
         return(retval)
       }
@@ -557,7 +551,12 @@
       self$attrs_by_region_dt <- data.table::as.data.table(do.call("cbind", attrs_by_region))
       rm(attrs_by_region)
 
-      colnames(self$attrs_by_region_dt) <- names(self$match_by)
+      if (!is.null(additional_attributes)) {
+        colnames(self$attrs_by_region_dt) <- c(names(self$match_by), additional_attributes)
+      } else {
+        colnames(self$attrs_by_region_dt) <- names(self$match_by)
+      }
+
       self$attrs_by_region_dt[, (self$attribute_name_in_corpus) := self$values]
 
       invisible(self)
@@ -565,27 +564,63 @@
     #' @description check the completeness of the merging operation and add
     #'   missing values interactively.
-    #' @param check_for_groups a \code{character vector}; in case not all
-    #'   elements are expected to have been matched, filter which elements
-    #'   should be checked.
-    #' @param modify \code{logical}; whether missing values should not only be
+    #' @param check_for_groups a named `list`; in case not all elements are
+    #'   expected to have been matched, select which elements should be checked.
+    #' @param negate `logical`; whether the selection in `check_for_groups`
+    #'   should be negated, i.e. whether these elements should be filtered out
+    #'   instead of selected.
+    #' @param modify `logical`; whether missing values should not only be
     #'   inspected but also modified interactively.
-    #' @param match_fuzzily_by a \code{character vector}; if not NULL, a fuzzy
+    #' @param match_fuzzily_by a `character vector`; if not NULL, a fuzzy
     #'   match will be performed on the column indicated.
-    #' @param doc_dir a \code{character vector}; Indicating a directory in which
+    #' @param doc_dir a `character vector`; indicating a directory in which
     #'   a text file is created which documents manual changes to the merge.
-    #' @param verbose \code{logical}; whether to print more comprehensive
+    #' @param ignore_case `logical`; whether to ignore case when matching. This
+    #'   is an argument of `stringdist_join` which is used in
+    #'   `fuzzy_join_missing_values`.
+    #' @param dist_method A `character`; the distance measure used by
+    #'   `stringdist_join`. See `stringdist-metrics` in the `stringdist` package
+    #'   which is used in `fuzzy_join_missing_values`.
+    #' @param max_dist An `integer` value indicating the maximum distance
+    #'   between the two input vectors. This is an argument of `stringdist_join`
+    #'   which is used in `fuzzy_join_missing_values`.
+    #' @param verbose `logical`; whether to print more comprehensive
     #'   messages.
-    check_and_add_missing_values = function(check_for_groups = NULL,
-                                            modify = FALSE,
+    #' @references For fuzzy matching `fuzzyjoin` is a crucial dependency:
+    #'
+    #'   David Robinson (2020). 
fuzzyjoin: Join Tables Together on + #' Inexact Matching. R package version 0.1.6. + #' https://CRAN.R-project.org/package=fuzzyjoin + #' @references Using `stringdist_join` in combination with `fuzzy_join` in + #' `fuzzy_join_missing_values()` was inspired by the second answer here: + #' https://stackoverflow.com/questions/48008903/combined-fuzzy-and-exact-matching + #' @references Using `formals` to reset the default values of + #' `stringdist_join` in `fuzzy_join_missing_values()` was inspired by the + #' second answer here: + #' https://stackoverflow.com/questions/27673415/store-function-arguments-inside-the-function-and-apply-them-for-future-use-in-r + #' @references Large parts of `add_missing_attributes_via_shiny()` are taken + #' from the implementation of rhandsontable and shinyWidgets in: Blaette, + #' Andreas (2023). polmineR: Verbs and Nouns for Corpus Analysis. R + #' package version v0.8.8. + check_and_add_missing_values = function(check_for_groups = NULL, + negate = FALSE, + modify = FALSE, match_fuzzily_by = NULL, - doc_dir = NULL, + doc_dir = NULL, + ignore_case = TRUE, + dist_method = "lv", + max_dist = 4L, verbose = TRUE) { if (!is.null(check_for_groups)) { if (verbose) message("... subsetting by groups which should have been matched.") for (i in 1:length(check_for_groups)) { - attrs_by_region_dt_min <- self$attrs_by_region_dt[get(names(check_for_groups)[[i]]) == check_for_groups[[i]], ] + group_name <- names(check_for_groups)[[i]] + if (isTRUE(negate)) { + attrs_by_region_dt_min <- self$attrs_by_region_dt[!get(group_name) %in% check_for_groups[[i]], ] + } else { + attrs_by_region_dt_min <- self$attrs_by_region_dt[get(group_name) %in% check_for_groups[[i]], ] + } attrs_by_region_dt_min <- unique(attrs_by_region_dt_min[is.na(get(self$attribute_name_in_corpus))]) } } else { @@ -594,12 +629,9 @@ LTDataset <- R6Class( if (nrow(attrs_by_region_dt_min) > 0) { - if (modify) { - - # if there are missing values and modify is TRUE, then we might add values - # manually. + if (isTRUE(modify)) { - add_manually <- menu(title = "After inspecting the results, do you want to add values for the missing attributes manually?\n\nThese additions are added to the value vector used for encoding.\n\nThese manual additions are documented in a log file.\n\nAlternatively, modify ID resource and redo?", + add_manually <- menu(title = "After inspecting the results, do you want to add values for the missing attributes manually?\n\nThese additions are added to the value vector used for encoding.\n\nThese manual additions are documented in a log file.\n\nAlternatively, modify ID resource and redo.", choices = c("Yes", "No")) if (!exists("add_manually") || add_manually == 2) { @@ -608,54 +640,54 @@ LTDataset <- R6Class( if (is.null(doc_dir)) stop("No existing directory provided.") - if (!is.null(match_fuzzily_by)) { # if it is allowed to match fuzzily, the attribute stated in the # attribute is matched via fuzzy matching while all other # variables are used for literate matching. 
- attrs_by_region_dt_min <- self$fuzzy_join_missing_values(attrs_by_region_dt_min, match_fuzzily_by) - + attrs_by_region_dt_min_mod <- self$fuzzy_join_missing_values(attrs_by_region_dt_min = attrs_by_region_dt_min, + match_fuzzily_by = match_fuzzily_by, + ignore_case = ignore_case, + dist_method = dist_method, + max_dist = max_dist) + + attrs_by_region_dt_min_mod <- self$add_missing_attributes_via_shiny(y = attrs_by_region_dt_min_mod, + doc_dir = doc_dir) + } else { + + attrs_by_region_dt_min_mod <- self$add_missing_attributes_via_shiny(y = attrs_by_region_dt_min, + doc_dir = doc_dir) } - - # make shiny / rhandsontable with log: LTD$attribute_name_in_corpus - # added manually in the following instances: ... (simply by subsetting - # changed rows?) - - attrs_by_region_dt_min <- self$add_missing_attributes_via_shiny(y = attrs_by_region_dt_min, doc_dir = doc_dir) - + # first check if any of the values should not be kept - keep_not_idx <- which(attrs_by_region_dt_min[["keep"]] == FALSE) - if (length(keep_not_idx) > 0) attrs_by_region_dt_min <- attrs_by_region_dt_min[-keep_not_idx, ] + keep_not_idx <- which(attrs_by_region_dt_min_mod[["keep"]] == FALSE) + if (length(keep_not_idx) > 0) attrs_by_region_dt_min_mod <- attrs_by_region_dt_min_mod[-keep_not_idx, ] # then we need to check if each speaker only occurs once with the # actual attribute columns. cols_to_keep_after_shiny <- c(names(self$match_by), self$attribute_name_in_corpus) - if (nrow(unique(attrs_by_region_dt_min[, ..cols_to_keep_after_shiny])) != nrow(attrs_by_region_dt_min)) { + if (nrow(unique(attrs_by_region_dt_min_mod[, ..cols_to_keep_after_shiny])) != nrow(attrs_by_region_dt_min)) { stop("... there seems to be a problem as there are more rows than unique attribute combinations, meaning that a single value has been added more than once.") - - # IDEALLY THIS WOULD TRIGGER THE SHINY APPLICATION AGAIN } else { - attrs_by_region_dt_min <- attrs_by_region_dt_min[, ..cols_to_keep_after_shiny] + attrs_by_region_dt_min_mod <- attrs_by_region_dt_min_mod[, ..cols_to_keep_after_shiny] } - attrs_by_region_dt_min_added <- attrs_by_region_dt_min[!is.na(get(self$attribute_name_in_corpus)), ] - self$missing_after_check <- attrs_by_region_dt_min[is.na(get(self$attribute_name_in_corpus)), ] + attrs_by_region_dt_min_added <- attrs_by_region_dt_min_mod[!is.na(get(self$attribute_name_in_corpus)), ] + self$missing_after_check <- attrs_by_region_dt_min_mod[is.na(get(self$attribute_name_in_corpus)), ] # indicate that this is added attrs_by_region_dt_min_added[, added := TRUE] # we have to change the values in self$values here - join_cols <- setdiff(names(attrs_by_region_dt_min), self$attribute_name_in_corpus) + join_cols <- setdiff(names(attrs_by_region_dt_min_mod), self$attribute_name_in_corpus) self$attrs_by_region_dt[attrs_by_region_dt_min_added, - c((self$attribute_name_in_corpus), "added") := .(get(paste0("i.", (self$attribute_name_in_corpus))), i.added), - on = join_cols] + c((self$attribute_name_in_corpus), "added") := .(get(paste0("i.", (self$attribute_name_in_corpus))), i.added), + on = join_cols] # now which rows in attrs_by_region_dt are those added. - rows_to_add_idx <- self$attrs_by_region_dt[added == TRUE, which = TRUE] if (length(rows_to_add_idx) > 0) { @@ -675,7 +707,8 @@ LTDataset <- R6Class( invisible(self) }, - #' @description Add missing values with shiny and rhandsontable. Called by `check_and_add_missing_values()`. + #' @description Add missing values with shiny and rhandsontable. Called by + #' `check_and_add_missing_values()`. 
#' @param y a \code{data.table} containing incomplete rows after the merge.
     #' @param doc_dir a \code{character vector}; Indicating a directory in which
     #'   a text file is created which documents manual changes to the merge.
     add_missing_attributes_via_shiny = function(y, doc_dir) {
 
       # add keep column and set to TRUE per default
       y[, keep := TRUE]
 
-      ##### This is taken from the internal and never really used
-      ##### WikiParliamentaryLookup package (check_and_correct_incomplete_mps.R) which
-      ##### is based on the implementation of rhandsontable and shinyWidgets used in
-      ##### polmineR.
-
+      # the first attempts to make this work were done in the internal
+      # and never really used WikiParliamentaryLookup package
+      # (check_and_correct_incomplete_mps.R).
+
       # prepare shiny gadget
       .editing_gadget_ui <- function() {
         miniUI::miniPage(
@@ -729,7 +761,6 @@
       y <- shiny::runGadget(.editing_gadget_ui(), server, viewer = shiny::paneViewer(minHeight = 550))
 
       # now extract the added attributes and document.
-
       document_log_file <- sprintf("%s/manual_addition_of_attribute_while_linking_%s.txt",
                                    doc_dir,
                                    Sys.Date())
@@ -751,124 +782,195 @@
     #' @description Add missing values using a fuzzy join. Called by
     #'   `check_and_add_missing_values()`.
-    #' @param attrs_by_region_dt_min a \code{data.table} containing incomplete
-    #'   rows after the merge.
-    #' @param match_fuzzily_by a \code{character vector}; if not NULL, a fuzzy
-    #'   match will be performed on the column indicated.
-    #' @references See the second answer here:
-    #'   https://stackoverflow.com/questions/48008903/combined-fuzzy-and-exact-matching
-    fuzzy_join_missing_values = function(attrs_by_region_dt_min, match_fuzzily_by) {
+    #' @param attrs_by_region_dt_min a `data.table` containing incomplete rows
+    #'   after the merge.
+    #' @param match_fuzzily_by a `character vector`; if not NULL, a fuzzy match
+    #'   will be performed on the column indicated.
+    #' @param ignore_case `logical`; whether to ignore case when matching. This
+    #'   is an argument of `stringdist_join`.
+    #' @param dist_method A `character`; the distance measure used by
+    #'   `stringdist_join`. See `stringdist-metrics` in the `stringdist`
+    #'   package.
+    #' @param max_dist An `integer` value indicating the maximum distance
+    #'   between the two input vectors. This is an argument of `stringdist_join`.
+    fuzzy_join_missing_values = function(attrs_by_region_dt_min,
+                                         match_fuzzily_by,
+                                         ignore_case = TRUE,
+                                         dist_method = "lv",
+                                         max_dist = 4L) {
 
       message("... performing fuzzy matching.")
 
-      # first, define a match function
-      # https://stackoverflow.com/questions/48008903/combined-fuzzy-and-exact-matching
-
-      match_fun_stringdist <- function(v1, v2) {
-
-        ignore_case = TRUE
-        method = "lv"
-        max_dist = 4
-        distance_col = "dist"
-
-        if (ignore_case) {
-          v1 <- stringr::str_to_lower(v1)
-          v2 <- stringr::str_to_lower(v2)
-        }
-
-        # maybe handle NAs like that? 
-        if (any(is.na(v1))) v1[is.na(v1)] <- ""
-        if (any(is.na(v2))) v2[is.na(v2)] <- ""
-
-        # shortcut for Levenshtein-like methods: if the difference in
-        # string length is greater than the maximum string distance, the
-        # edit distance must be at least that large
-
-        # length is much faster to compute than string distance
-        if (method %in% c("osa", "lv", "dl")) {
-          length_diff <- abs(stringr::str_length(v1) - stringr::str_length(v2))
-          include <- length_diff <= max_dist
-
-          dists <- rep(NA, length(v1))
-
-          dists[include] <- stringdist::stringdist(v1[include], v2[include], method = method)
-        } else {
-          # have to compute them all
-          dists <- stringdist::stringdist(v1, v2, method = method)
-        }
-        ret <- dplyr::tibble(include = (dists <= max_dist))
-        if (!is.null(distance_col)) {
-          ret[[distance_col]] <- dists
-        }
-        ret
-      }
-
-      fuzzy_in_textual_data <- as.character(self$match_by[which(names(self$match_by) == match_fuzzily_by)])
-      fuzzy_in_external_data <- names(self$match_by)[which(names(self$match_by) == match_fuzzily_by)]
-
-      non_fuzzy_in_textual_data <- as.character(self$match_by[which(names(self$match_by) != match_fuzzily_by)])
-      non_fuzzy_in_external_data <- names(self$match_by)[which(names(self$match_by) != match_fuzzily_by)]
-
-      by_x_vector <- c(fuzzy_in_textual_data, non_fuzzy_in_textual_data)
-      by_y_vector <- c(fuzzy_in_external_data, non_fuzzy_in_external_data)
+      # column names in external data are renamed in the class slot.
+      fuzzy_var_name <- names(self$match_by)[match(match_fuzzily_by, names(self$match_by))]
+      non_fuzzy_var_name <- names(self$match_by)[which(!names(self$match_by) %in% match_fuzzily_by)]
+      by_vector <- c(fuzzy_var_name, non_fuzzy_var_name)
+      target_var <- colnames(self$external_resource)[which(!colnames(self$external_resource) %in% by_vector)] # this might not be very robust.
 
-      match_fun_list <- vector("list", length(c(fuzzy_in_textual_data, non_fuzzy_in_textual_data)))
+      match_fun_list <- vector("list", length(c(fuzzy_var_name, non_fuzzy_var_name)))
 
-      for (i in 1:length(fuzzy_in_textual_data)) {
-        match_fun_list[[i]] <- match_fun_stringdist
+      # replacing the default values of fuzzyjoin::stringdist_join
+      stringdist_join_match_fun_for_list <- self$stringdist_join_match_fun
+
+      formals(stringdist_join_match_fun_for_list)[["ignore_case"]] <- ignore_case
+      formals(stringdist_join_match_fun_for_list)[["method"]] <- dist_method
+      formals(stringdist_join_match_fun_for_list)[["max_dist"]] <- max_dist
+
+      for (i in 1:length(fuzzy_var_name)) {
+        match_fun_list[[i]] <- stringdist_join_match_fun_for_list
       }
 
-      for (i in (length(fuzzy_in_textual_data) + 1):length(c(fuzzy_in_textual_data, non_fuzzy_in_textual_data))) {
+      for (i in (length(fuzzy_var_name) + 1):length(c(fuzzy_var_name, non_fuzzy_var_name))) {
        match_fun_list[[i]] <- eval(`==`)
      }
 
      attrs_by_region_dt_min_joined <- fuzzyjoin::fuzzy_join(
        x = attrs_by_region_dt_min,
-        y = self$external_resource,
-        by = list(x = by_x_vector,
-                  y = by_y_vector
-        ),
-
+        y = self$external_resource,
+        by = by_vector,
        match_fun = match_fun_list,
        mode = "left"
      )
 
      # clean up. Remove all ".y" columns except for the fuzzily joined
      # attribute and all dist column. 
+      attrs_by_region_dt_min_joined <- data.table::as.data.table(attrs_by_region_dt_min_joined)
 
-      columns_to_omit <- c(paste0(non_fuzzy_in_textual_data, ".y"),
-                           paste0(c(fuzzy_in_textual_data, non_fuzzy_in_textual_data), ".dist")
-      )
+
+      columns_to_omit <- c(paste0(by_vector, ".y"), paste0(by_vector, ".distance_col"))
+      fuzzy_var_name_joined <- paste0(fuzzy_var_name, ".y")
+      columns_to_omit <- columns_to_omit[columns_to_omit != fuzzy_var_name_joined]
 
       attrs_by_region_dt_min_joined[, (columns_to_omit) := NULL]
 
       # then use the joined new attribute to add to the missing values
-
       attribute_name_in_corpus_in_x <- paste0(self$attribute_name_in_corpus, ".x")
       attribute_name_in_corpus_in_y <- paste0(self$attribute_name_in_corpus, ".y")
 
       # we remove the column with the old, missing values
       attrs_by_region_dt_min_joined[, (attribute_name_in_corpus_in_x) := NULL]
-      
+
       # and rename it accordingly
-      data.table::setnames(attrs_by_region_dt_min_joined, old = attribute_name_in_corpus_in_y, new = self$attribute_name_in_corpus)
-      
+      data.table::setnames(attrs_by_region_dt_min_joined,
+                           old = attribute_name_in_corpus_in_y,
+                           new = self$attribute_name_in_corpus)
+
       # and do this for all the other textual data columns (".x") as well.
       data.table::setnames(attrs_by_region_dt_min_joined, old = paste0(names(self$match_by), ".x"), new = names(self$match_by))
-      
-      # finally, we rename the added fuzzily matched match variable to make
+
+      # finally, we rename the fuzzily matched match variable to make
       # transparent
-      data.table::setnames(attrs_by_region_dt_min_joined, old = paste0(fuzzy_in_external_data, ".y"), new = paste0(fuzzy_in_external_data, "_fuzzy_matched"))
-      data.table::setcolorder(attrs_by_region_dt_min_joined, c(names(self$match_by),
-                                                               self$attribute_name_in_corpus,
-                                                               paste0(fuzzy_in_external_data, "_fuzzy_matched")))
+
+      data.table::setnames(attrs_by_region_dt_min_joined,
+                           old = fuzzy_var_name_joined,
+                           new = paste0(fuzzy_var_name, "_fuzzy_matched"))
+
+      data.table::setcolorder(attrs_by_region_dt_min_joined, c(names(self$match_by),
+                                                               paste0(fuzzy_var_name, "_fuzzy_matched"),
+                                                               target_var))
 
       return(attrs_by_region_dt_min_joined)
+    },
+
+    #' @description The function used for those attributes which are matched
+    #'   fuzzily. Called by `fuzzy_join_missing_values()`. This function is
+    #'   identical to the "match_fun" of `stringdist_join()` of the `fuzzyjoin`
+    #'   package with changed defaults.
+    #' @details The defaults were changed. "lv" is now the default method,
+    #'   max_dist is 4 and ignore_case is now TRUE.
+    #' @param v1,v2 `character vectors` to be compared fuzzily.
+    #' @param ignore_case `logical`; whether to ignore case when matching. This
+    #'   is an argument of `stringdist_join`.
+    #' @param method A `character`; the distance measure used by
+    #'   `stringdist_join`. See `stringdist-metrics` in the `stringdist`
+    #'   package.
+    #' @param max_dist An `integer` value indicating the maximum distance
+    #'   between the two input vectors. This is an argument of `stringdist_join`.
+    
stringdist_join_match_fun = function(v1,
+                                         v2,
+                                         max_dist = 4L,
+                                         method = c("lv", "osa", "dl",
+                                                    "hamming", "lcs", "qgram",
+                                                    "cosine", "jaccard", "jw",
+                                                    "soundex"),
+                                         ignore_case = TRUE) {
+      method <- match.arg(method)
+
+      if (method == "soundex") {
+        max_dist <- 0.5
+      }
+
+      if (ignore_case) {
+        v1 <- stringr::str_to_lower(v1)
+        v2 <- stringr::str_to_lower(v2)
+      }
+      if (method %in% c("osa", "lv", "dl")) {
+        length_diff <- abs(stringr::str_length(v1) - stringr::str_length(v2))
+        include <- length_diff <= max_dist
+        dists <- rep(NA, length(v1))
+        dists[include] <- stringdist::stringdist(v1[include],
+                                                 v2[include],
+                                                 method = method)
+      }
+      else {
+        dists <- stringdist::stringdist(v1, v2, method = method)
+      }
+      ret <- tibble::tibble(include = (dists <= max_dist))
+      ret[["distance_col"]] <- dists
+      return(ret)
+    },
+
+    #' @description The function checks if the entire corpus is covered by the
+    #'   region matrix. If not, the missing regions of the corpus are added to
+    #'   the region matrix and the value vector. This is needed to encode the
+    #'   new data.
+    add_missing_regions = function() {
+
+      if (self$verbose) message("... adding missing regions to region matrix and value vector.")
+
+      if (isTRUE(is.character(self$textual_data))) {
+
+        all_cpos_in_corpus <- 0L:(polmineR::size(self$textual_data) - 1)
+
+      } else {
+
+        # otherwise, it's a subcorpus and the name of the corpus is taken from
+        # the slot.
+
+        all_cpos_in_corpus <- 0L:(polmineR::size(self$textual_data@corpus) - 1)
+      }
+
+      cpos_missing <- which(!all_cpos_in_corpus %in% self$cpos_left:self$cpos_right) - 1
+
+      if (length(cpos_missing) > 0) {
+
+        # see stack overflow for cumsum: https://stackoverflow.com/questions/24837401/find-consecutive-values-in-vector-in-r
+        missing_consecutive <- split(cpos_missing, cumsum(c(1, diff(cpos_missing) != 1)))
+        missing_matrix <- t(sapply(missing_consecutive, function(x) c(min(x), max(x))))
+        rownames(missing_matrix) <- NULL
+
+        # bind it to the end of the region_matrix
+        self$region_matrix <- rbind(self$region_matrix, missing_matrix)
+
+        # and add NA values to the values slot
+        self$values <- c(self$values,
+                         rep(x = NA, times = nrow(missing_matrix)))
+
+        # reorder by first cpos per region
+        order_vector <- order(self$region_matrix[, 1])
+        self$region_matrix <- self$region_matrix[order_vector, ]
+        self$values <- self$values[order_vector]
+
+        # last sanity check here
+        stopifnot(nrow(self$region_matrix) == length(self$values))
+      }
+
+      invisible(self)
     }
   )
 )
diff --git a/R/LinkTools.R b/R/LinkTools.R
index a82e39d..011b484 100644
--- a/R/LinkTools.R
+++ b/R/LinkTools.R
@@ -14,23 +14,10 @@
 #' (data.table, XML or CWB).
 #'
 #' d) the package includes a wrapper for the Named Entity Linking of textual
-#' data based on DBPedia Spotlight.
+#' data based on DBpedia Spotlight.
 #' @keywords package
 #' @docType package
 #' @aliases LinkTools LinkTools-package
 #' @name LinkTools-package
 #' @rdname LinkTools-package
 NULL
-
-#' Stammdaten with WikiData-IDs
-#'
-#' A minimized version of the Stammdaten of the German Bundestag of the 13th and
-#' 14th legislative period with added WikiData IDs retrieved via the Wikidata Query
-#' Service and added party affiliations specific for the legislative period retrieved
-#' from Wikipedia. For preparation see bt_stammdaten. 
-#' @source https://www.bundestag.de/services/opendata (Creation Date 2021-11-04)
-#' @source https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(13._Wahlperiode) (Information Retrieved on 2021-11-23)
-#' @source https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(14._Wahlperiode) (Information Retrieved on 2021-11-23)
-#' @docType data
-#' @keywords datasets
-"stammdaten_wikidatafied_2022_02_01_min"
diff --git a/data-raw/stammdaten_wikidatafied_2022-02-01.R b/data-raw/stammdaten_wikidatafied_2022-02-01.R
deleted file mode 100644
index c2b8a3f..0000000
--- a/data-raw/stammdaten_wikidatafied_2022-02-01.R
+++ /dev/null
@@ -1,23 +0,0 @@
-# prepare and select Stammdaten (2023-02-06)
-
-library(dplyr)
-library(tidyr)
-library(data.table)
-
-stammdaten_file <- read.csv("~/lab/gitlab/bt_stammdaten/stammdaten_wikidatafied_2022-02-0")
-
-stammdaten_wikidatafied_2022_02_01_min <- stammdaten_file |>
-  filter(lp %in% c(13, 14)) |>
-  select(c(first_name, name_adel, name_praefix, family_name, party_by_lp, lp, parliamentary_group, QID)) |>
-  mutate(role = "mp") |>
-  mutate(speaker_full_name = trimws(paste(first_name, name_adel, name_praefix, family_name))) |>
-  mutate(speaker_full_name = gsub("\\s+", " ", speaker_full_name)) |>
-  mutate(lp = as.character(lp)) |>
-  separate_rows(parliamentary_group, sep = "\\|") |>
-  select(speaker_full_name, party_by_lp, lp, role, QID) |>
-  rename("party" = "party_by_lp",
-         "speaker" = "speaker_full_name",
-         "wikidata_id" = "QID") |>
-  as.data.table()
-
-save(stammdaten_wikidatafied_2022_02_01_min, file = "~/lab/github/LinkTools/data/stammdaten_wikidatafied_2022_02_01_min.rda")
diff --git a/data/stammdaten_wikidatafied_2022_02_01_min.rda b/data/stammdaten_wikidatafied_2022_02_01_min.rda
deleted file mode 100644
index 35cb45b..0000000
Binary files a/data/stammdaten_wikidatafied_2022_02_01_min.rda and /dev/null differ
diff --git a/man/LTDataset.Rd b/man/LTDataset.Rd
index 3cafc3a..d0219df 100644
--- a/man/LTDataset.Rd
+++ b/man/LTDataset.Rd
@@ -18,6 +18,13 @@
 of datasets which should be linked, i.e. the transformation into a comparable
 format and the assignment of shared unique identifiers, b) the merge of
 datasets based on these identifiers, c) the encoding or enrichment of the
 data with three output formats (data.table, XML or CWB).
+
+\code{additional_attributes} might be useful when information of a
+dataset is added which only covers a part of the corpus, such as a
+specific period of time or specific groups of speakers.
+
+The defaults were changed. "lv" is now the default method,
+max_dist is 4 and ignore_case is now TRUE.
 }
 \section{Usage}{
 LTDataset <- LTDataset$new()
@@ -59,14 +66,37 @@
 operation and add missing values interactively.}
 to add missing values interactively.}
 \item{\code{create_attribute_region_datatable()}}{returns a data.table of matched
 attributes. 
Can be used for further manual checks.} -\item{\code{encode_new_s_attribute(add_temporarily)}}{encodes the new attribute +\item{\code{encode_new_s_attribute()}}{encodes the new attribute as a structural attribute (for CWB)} } } \references{ -See the second answer here: +When formulating the data.table join functions in +\code{join_textual_and_external_data()}, the following Stack Overflow links +were useful: +https://stackoverflow.com/questions/45043600/merging-all-column-by-reference-in-a-data-table +and https://stackoverflow.com/questions/44433451/r-data-table-update-join + +For fuzzy matching \code{fuzzyjoin} is a crucial dependency: + +David Robinson (2020). fuzzyjoin: Join Tables Together on +Inexact Matching. R package version 0.1.6. +https://CRAN.R-project.org/package=fuzzyjoin + +Using \code{stringdist_join} in combination with \code{fuzzy_join} in +\code{fuzzy_join_missing_values()} was inspired by the second answer here: https://stackoverflow.com/questions/48008903/combined-fuzzy-and-exact-matching + +Using \code{formals} to reset the default values of +\code{stringdist_join} in \code{fuzzy_join_missing_values()} was inspired by the +second answer here: +https://stackoverflow.com/questions/27673415/store-function-arguments-inside-the-function-and-apply-them-for-future-use-in-r + +Large parts of \code{add_missing_attributes_via_shiny()} are taken +from the implementation of rhandsontable and shinyWidgets in: Blaette, +Andreas (2023). polmineR: Verbs and Nouns for Corpus Analysis. R +package version v0.8.8. } \section{Public fields}{ \if{html}{\out{
}} @@ -146,6 +176,8 @@ missing after the manual check} \item \href{#method-LTDataset-check_and_add_missing_values}{\code{LTDataset$check_and_add_missing_values()}} \item \href{#method-LTDataset-add_missing_attributes_via_shiny}{\code{LTDataset$add_missing_attributes_via_shiny()}} \item \href{#method-LTDataset-fuzzy_join_missing_values}{\code{LTDataset$fuzzy_join_missing_values()}} +\item \href{#method-LTDataset-stringdist_join_match_fun}{\code{LTDataset$stringdist_join_match_fun()}} +\item \href{#method-LTDataset-add_missing_regions}{\code{LTDataset$add_missing_regions()}} \item \href{#method-LTDataset-clone}{\code{LTDataset$clone()}} } } @@ -249,17 +281,9 @@ transform the matched data to a matrix for encoding. \subsection{Method \code{encode_new_s_attribute()}}{ encodes the new attribute as a structural attribute (for CWB). \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{LTDataset$encode_new_s_attribute(add_temporarily)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{LTDataset$encode_new_s_attribute()}\if{html}{\out{
}} } -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{add_temporarily}}{whether to add the structural attribute temporarily -or permanently.} -} -\if{html}{\out{
}} -} \subsection{Returns}{ A new \code{LTDataset} object. } @@ -271,13 +295,20 @@ A new \code{LTDataset} object. returns a data.table of matched attributes. Can be used for further manual checks. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{LTDataset$create_attribute_region_datatable(verbose = FALSE)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{LTDataset$create_attribute_region_datatable( + verbose = FALSE, + additional_attributes = NULL +)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}}
 \describe{
 \item{\code{verbose}}{\code{logical}}
+
+\item{\code{additional_attributes}}{a \code{character vector} of additional
+structural attributes which should be considered when evaluating the
+results of the linkage.}
 }
 \if{html}{\out{
}} } @@ -291,9 +322,13 @@ missing values interactively. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LTDataset$check_and_add_missing_values( check_for_groups = NULL, + negate = FALSE, modify = FALSE, match_fuzzily_by = NULL, doc_dir = NULL, + ignore_case = TRUE, + dist_method = "lv", + max_dist = 4L, verbose = TRUE )}\if{html}{\out{
}} } @@ -301,19 +336,34 @@ missing values interactively. \subsection{Arguments}{ \if{html}{\out{
}}
 \describe{
-\item{\code{check_for_groups}}{a \code{character vector}; in case not all
-elements are expected to have been matched, filter which elements
-should be checked.}
+\item{\code{check_for_groups}}{a named \code{list}; in case not all elements are
+expected to have been matched, select which elements should be checked.}
+
+\item{\code{negate}}{\code{logical}; whether the selection in \code{check_for_groups}
+should be negated, i.e. whether these elements should be filtered out instead of
+selected.}
 
 \item{\code{modify}}{\code{logical}; whether missing values should not only be
 inspected but also modified interactively.}
 
-\item{\code{match_fuzzily_by}}{a \code{character vector}; if not NULL, a fuzzy
+\item{\code{match_fuzzily_by}}{a \verb{character vector}; if not NULL, a fuzzy
 match will be performed on the column indicated.}
 
-\item{\code{doc_dir}}{a \code{character vector}; Indicating a directory in which
+\item{\code{doc_dir}}{a \verb{character vector}; indicating a directory in which
 a text file is created which documents manual changes to the merge.}
 
+\item{\code{ignore_case}}{\code{logical}; whether to ignore case when matching. This is an
+argument of \code{stringdist_join} which is used in
+\code{fuzzy_join_missing_values}.}
+
+\item{\code{dist_method}}{A \code{character}; the distance measure used by
+\code{stringdist_join}. See \code{stringdist-metrics} in the \code{stringdist} package
+which is used in \code{fuzzy_join_missing_values}.}
+
+\item{\code{max_dist}}{An \code{integer} value indicating the maximum distance
+between the two input vectors. This is an argument of \code{stringdist_join}
+which is used in \code{fuzzy_join_missing_values}.}
+
 \item{\code{verbose}}{\code{logical}; whether to print more comprehensive
 messages.}
 }
 \if{html}{\out{}}
 \if{latex}{\out{\hypertarget{method-LTDataset-add_missing_attributes_via_shiny}{}}}
 \subsection{Method \code{add_missing_attributes_via_shiny()}}{
-Add missing values with shiny and rhandsontable. Called by \code{check_and_add_missing_values()}.
+Add missing values with shiny and rhandsontable. Called by
+\code{check_and_add_missing_values()}.
 \subsection{Usage}{
 \if{html}{\out{
}}\preformatted{LTDataset$add_missing_attributes_via_shiny(y, doc_dir)}\if{html}{\out{
}} } @@ -347,20 +398,86 @@ a text file is created which documents manual changes to the merge.} Add missing values using a fuzzy join. Called by \code{check_and_add_missing_values()}. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{LTDataset$fuzzy_join_missing_values(attrs_by_region_dt_min, match_fuzzily_by)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{LTDataset$fuzzy_join_missing_values( + attrs_by_region_dt_min, + match_fuzzily_by, + ignore_case = TRUE, + dist_method = "lv", + max_dist = 4L +)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}}
 \describe{
-\item{\code{attrs_by_region_dt_min}}{a \code{data.table} containing incomplete
-rows after the merge.}
+\item{\code{attrs_by_region_dt_min}}{a \code{data.table} containing incomplete rows
+after the merge.}
 
-\item{\code{match_fuzzily_by}}{a \code{character vector}; if not NULL, a fuzzy
-match will be performed on the column indicated.}
+\item{\code{match_fuzzily_by}}{a \verb{character vector}; if not NULL, a fuzzy match
+will be performed on the column indicated.}
+
+\item{\code{ignore_case}}{\code{logical}; whether to ignore case when matching. This is an
+argument of \code{stringdist_join}.}
+
+\item{\code{dist_method}}{A \code{character}; the distance measure used by
+\code{stringdist_join}. See \code{stringdist-metrics} in the \code{stringdist}
+package.}
+
+\item{\code{max_dist}}{An \code{integer} value indicating the maximum distance
+between the two input vectors. This is an argument of \code{stringdist_join}.}
 }
 \if{html}{\out{
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LTDataset-stringdist_join_match_fun}{}}} +\subsection{Method \code{stringdist_join_match_fun()}}{ +The function used for those attributes which are matched +fuzzily. Called by \code{fuzzy_join_missing_values()}. This function is +identical to the "match_fun" of \code{stringdist_join()} of the \code{fuzzyjoin} +package with changed defaults. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LTDataset$stringdist_join_match_fun( + v1, + v2, + max_dist = 4L, + method = c("lv", "osa", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", + "soundex"), + ignore_case = TRUE +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}}
+\describe{
+\item{\code{v1, v2}}{\verb{character vectors} to be compared fuzzily.}
+
+\item{\code{max_dist}}{An \code{integer} value indicating the maximum distance
+between the two input vectors. This is an argument of \code{stringdist_join}.}
+
+\item{\code{method}}{A \code{character}; the distance measure used by
+\code{stringdist_join}. See \code{stringdist-metrics} in the \code{stringdist}
+package.}
+
+\item{\code{ignore_case}}{\code{logical}; whether to ignore case when matching. This is an
+argument of \code{stringdist_join}.}
+}
+\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LTDataset-add_missing_regions}{}}} +\subsection{Method \code{add_missing_regions()}}{ +The function checks if the entire corpus is covered by the +region matrix. If not, the missing regions of the corpus are added to +the region matrix and the value vector. This is needed to encode the +new data. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LTDataset$add_missing_regions()}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/LinkTools-Package.Rd b/man/LinkTools-Package.Rd new file mode 100644 index 0000000..4bf5f7a --- /dev/null +++ b/man/LinkTools-Package.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LinkTools.R +\name{LinkTools-Package} +\alias{LinkTools-Package} +\title{LinkTools-Package} +\description{ +This package facilitates the linkage of data sets via shared unique +identifiers. Four steps are integrated into this package: a) the preparation +of data sets which should be linked, i.e. the transformation into a comparable +format and the assignment of shared unique identifiers, b) the merge of +data sets based on these identifiers, c) the encoding or enrichment of the +data with three output formats (data.table, XML or CWB). In addition, d), the +package includes a wrapper for the Named Entity Linking of textual data based +on DBpedia Spotlight. +} diff --git a/man/LinkTools-package.Rd b/man/LinkTools-package.Rd index 723703c..1d4fc24 100644 --- a/man/LinkTools-package.Rd +++ b/man/LinkTools-package.Rd @@ -21,6 +21,6 @@ c) the encoding or enrichment of the data with three output formats (data.table, XML or CWB). d) the package includes a wrapper for the Named Entity Linking of textual -data based on DBPedia Spotlight. +data based on DBpedia Spotlight. } \keyword{package} diff --git a/man/stammdaten_wikidatafied_2022_02_01_min.Rd b/man/stammdaten_wikidatafied_2022_02_01_min.Rd deleted file mode 100644 index 96cf525..0000000 --- a/man/stammdaten_wikidatafied_2022_02_01_min.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LinkTools.R -\docType{data} -\name{stammdaten_wikidatafied_2022_02_01_min} -\alias{stammdaten_wikidatafied_2022_02_01_min} -\title{Stammdaten with WikiData-IDs} -\format{ -An object of class \code{data.table} (inherits from \code{data.frame}) with 1397 rows and 5 columns. -} -\source{ -https://www.bundestag.de/services/opendata (Creation Date 2021-11-04) - -https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(13._Wahlperiode) (Information Retrieved on 2021-11-23) - -https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(14._Wahlperiode) (Information Retrieved on 2021-11-23) -} -\usage{ -stammdaten_wikidatafied_2022_02_01_min -} -\description{ -A minimized version of the Stammdaten of the German Bundestag of the 13th and -14th legislative period with added WikiData IDs retrieved via the Wikidata Query -Service and added party affiliations specific for the legislative period retrieved -from Wikipedia. For preparation see bt_stammdaten. -} -\keyword{datasets} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..9c224ce --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? 
+# Learn more about the roles of various files in: +# * https://r-pkgs.org/tests.html +# * https://testthat.r-lib.org/reference/test_package.html#special-files + +library(testthat) +library(LinkTools) + +test_check("LinkTools") \ No newline at end of file diff --git a/tests/testthat/test-LTDataset.R b/tests/testthat/test-LTDataset.R new file mode 100644 index 0000000..5db859a --- /dev/null +++ b/tests/testthat/test-LTDataset.R @@ -0,0 +1,258 @@ +test_that("all unique speaker attribute pairs from the text data are kept and no observations are dropped implicitly when enriching entire corpora", { + + withr::local_package("polmineR") + use("polmineR") + + # all unique attribute combinations in GERMAPARLMINI + + s_attributes_gpmini <- s_attributes("GERMAPARLMINI", c("speaker", "party", "protocol_lp")) + s_attribute_combinations <- s_attributes_gpmini[, c("value", "party", "protocol_lp")] |> + unique() |> + nrow() + + # all unique attribute combinations in the enriched data.frame + + LTD <- LTDataset$new(textual_data = "GERMAPARLMINI", + textual_data_type = "cwb", + external_resource = btmp::btmp_de, + attr_to_add = c("id" = "wikidata_id"), + match_by = c("speaker" = "full_name", + "party" = "party_wikipedia", + "protocol_lp" = "legislative_period"), + split_by = NULL, + verbose = TRUE, + forced_encoding = "UTF-8") + + LTD$join_textual_and_external_data() + LTD$create_attribute_region_datatable() + n_attributes_by_region <- LTD$attrs_by_region_dt |> + unique() |> + nrow() + + RcppCWB::cqp_reset_registry() + + expect_equal(n_attributes_by_region, s_attribute_combinations) +}) + +test_that("all unique speaker attribute pairs from the text data are kept and no observations are dropped implicitly when enriching subcorpora", { + + withr::local_package("polmineR") + use("polmineR") + + gpmini_day <- corpus("GERMAPARLMINI") |> + polmineR::subset(date == "2009-11-12") + + s_attributes_gpmini <- s_attributes(gpmini_day, c("speaker", "party")) |> + nrow() + + LTD <- LTDataset$new(textual_data = gpmini_day, + textual_data_type = "cwb", + external_resource = btmp::btmp_de, + attr_to_add = c("id" = "wikidata_id"), + match_by = c("speaker" = "full_name", + "party" = "party_wikipedia"), + split_by = NULL, + verbose = TRUE, + forced_encoding = "UTF-8") + + LTD$join_textual_and_external_data() + LTD$create_attribute_region_datatable() + + n_attributes_by_region <- LTD$attrs_by_region_dt |> + unique() |> + nrow() + + RcppCWB::cqp_reset_registry() + + expect_equal(n_attributes_by_region, s_attributes_gpmini) +} +) + + +test_that("the output is identical regardless of the usage of split_by for corpora", { + + withr::local_package("polmineR") + use("polmineR") + + LTD <- LTDataset$new(textual_data = "GERMAPARLMINI", + textual_data_type = "cwb", + external_resource = btmp::btmp_de, + attr_to_add = c("id" = "wikidata_id"), + match_by = c("speaker" = "full_name", + "party" = "party_wikipedia", + "protocol_lp" = "legislative_period"), + split_by = NULL, + verbose = TRUE, + forced_encoding = "UTF-8") + + LTD$join_textual_and_external_data() + + LTD_split <- LTDataset$new(textual_data = "GERMAPARLMINI", + textual_data_type = "cwb", + external_resource = btmp::btmp_de, + attr_to_add = c("id" = "wikidata_id"), + match_by = c("speaker" = "full_name", + "party" = "party_wikipedia", + "protocol_lp" = "legislative_period"), + split_by = "date", + verbose = TRUE, + forced_encoding = "UTF-8") + + LTD_split$join_textual_and_external_data() + + RcppCWB::cqp_reset_registry() + + expect_identical(LTD_split$text_dt, LTD$text_dt) +}) + 
+
+test_that("the output is identical regardless of the usage of split_by for subcorpora", {
+
+  withr::local_package("polmineR")
+  use("polmineR")
+
+  gpmini_day <- corpus("GERMAPARLMINI") |>
+    polmineR::subset(date == "2009-11-12")
+
+  LTD <- LTDataset$new(textual_data = gpmini_day,
+                       textual_data_type = "cwb",
+                       external_resource = btmp::btmp_de,
+                       attr_to_add = c("id" = "wikidata_id"),
+                       match_by = c("speaker" = "full_name",
+                                    "party" = "party_wikipedia"),
+                       split_by = NULL,
+                       verbose = TRUE,
+                       forced_encoding = "UTF-8")
+
+  LTD$join_textual_and_external_data()
+
+  LTD_split <- LTDataset$new(textual_data = gpmini_day,
+                             textual_data_type = "cwb",
+                             external_resource = btmp::btmp_de,
+                             attr_to_add = c("id" = "wikidata_id"),
+                             match_by = c("speaker" = "full_name",
+                                          "party" = "party_wikipedia"),
+                             split_by = "party",
+                             verbose = TRUE,
+                             forced_encoding = "UTF-8")
+
+  LTD_split$join_textual_and_external_data()
+
+  RcppCWB::cqp_reset_registry()
+
+  expect_identical(LTD_split$text_dt, LTD$text_dt)
+})
+
+
+test_that("the region matrix in the end contains each cpos only once and the number of unique cpos equals the corpus size", {
+
+  withr::local_package("polmineR")
+  use("polmineR")
+
+  gpmini_day <- corpus("GERMAPARLMINI") |>
+    polmineR::subset(date == "2009-11-10")
+
+  LTD <- LTDataset$new(textual_data = gpmini_day,
+                       textual_data_type = "cwb",
+                       external_resource = btmp::btmp_de,
+                       attr_to_add = c("id" = "wikidata_id"),
+                       match_by = c("speaker" = "full_name",
+                                    "party" = "party_wikipedia"),
+                       split_by = NULL,
+                       verbose = TRUE,
+                       forced_encoding = "UTF-8")
+
+  LTD$join_textual_and_external_data()
+  LTD$create_attribute_region_datatable()
+  LTD$add_missing_regions()
+
+  cpos_vector <- apply(LTD$region_matrix, MARGIN = 1, function(row) c(row[[1]]:row[[2]])) |>
+    unlist()
+
+  RcppCWB::cqp_reset_registry()
+
+  expect_equal(length(cpos_vector), length(unique(cpos_vector)))
+  expect_equal(length(unique(cpos_vector)), size("GERMAPARLMINI"))
+}
+)
+
+
+test_that("The new attribute is correctly encoded in the corpus", {
+
+  withr::local_package("polmineR")
+  use("polmineR")
+
+  # copy corpus to temp dir, using the current default values of
+  # cwbtools::corpus_copy explicitly.
+
+  new_registry_dir <- fs::path(tempdir(), "cwb", "registry")
+  new_data_dir <- fs::path(tempdir(), "cwb", "indexed_corpora", tolower("GERMAPARLMINI"))
+
+  cwbtools::corpus_copy(corpus = "GERMAPARLMINI",
+                        registry_dir = RcppCWB::corpus_registry_dir(corpus = "GERMAPARLMINI"),
+                        data_dir = RcppCWB::corpus_data_dir(corpus = "GERMAPARLMINI",
+                                                            registry = RcppCWB::corpus_registry_dir(corpus = "GERMAPARLMINI")),
+                        registry_dir_new = new_registry_dir,
+                        data_dir_new = new_data_dir)
+
+  cwbtools::corpus_rename(
+    old = "GERMAPARLMINI",
+    new = "GERMAPARLMINI_TMP",
+    registry_dir = new_registry_dir,
+    verbose = TRUE
+  )
+
+  old_corpus_registry <- Sys.getenv("CORPUS_REGISTRY")
+
+  RcppCWB::cqp_initialize(registry = new_registry_dir)
+  stopifnot("GERMAPARLMINI_TMP" %in% corpus()[["corpus"]])
+
+  s_attributes_before <- s_attributes("GERMAPARLMINI_TMP")
+
+  gpmini_day <- corpus("GERMAPARLMINI_TMP") |>
+    polmineR::subset(date == "2009-11-10")
+
+  LTD <- LTDataset$new(textual_data = gpmini_day,
+                       textual_data_type = "cwb",
+                       external_resource = btmp::btmp_de,
+                       attr_to_add = c("id" = "wikidata_id"),
+                       match_by = c("speaker" = "full_name",
+                                    "party" = "party_wikipedia"),
+                       split_by = NULL,
+                       verbose = TRUE,
+                       forced_encoding = "UTF-8")
+
+  LTD$join_textual_and_external_data()
+  LTD$create_attribute_region_datatable()
+
+  LTD$encode_new_s_attribute()
+
+  RcppCWB::cqp_reset_registry()
+
+  s_attributes_after <- s_attributes("GERMAPARLMINI_TMP")
+
+  # expect that the new s_attribute is there
+  expect_true(setdiff(s_attributes_after, s_attributes_before) == "id")
+
+
+  # expect that this corresponds to the joins in the object
+  speaker_id_combination_encoded <- s_attributes(gpmini_day, c("speaker", "id")) |>
+    unique()
+
+  data.table::setorder(speaker_id_combination_encoded, speaker)
+
+  speaker_id_combination_ltd <- LTD$attrs_by_region_dt |>
+    unique() |>
+    data.table::setorder(speaker)
+
+  speaker_id_combination_ltd <- speaker_id_combination_ltd[, c("speaker", "id")]
+  speaker_id_combination_ltd[is.na(speaker_id_combination_ltd$id), "id"] <- "NA"
+
+  speaker_id_data_table <- merge(speaker_id_combination_encoded, speaker_id_combination_ltd, by = "speaker")
+
+  expect_equal(speaker_id_data_table$id.x, speaker_id_data_table$id.y)
+
+  RcppCWB::cqp_reset_registry(registry = old_corpus_registry)
+}
+)
+
diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd
index f291129..eb6f898 100644
--- a/vignettes/vignette.Rmd
+++ b/vignettes/vignette.Rmd
@@ -10,13 +10,20 @@
 ```{r libraries}
 library(LinkTools)
 library(polmineR)
+library(btmp)
 ```
 
+```{r}
+use("polmineR")
+set.seed(343)
+```
+
+
 # Linktools
 
-The motivation of this set of tools is to link textual data to existing external data sets. The textual data might come in form of XML files, CWB-indexed corpora or Quanteda corpora.
+The motivation of this set of tools is to link textual data to existing external data sets. The textual data might come in the form of XML files, CWB-indexed corpora or quanteda corpora.
 
-The external datasets might comprise of biographical data such as in the "Stammdaten des Deutschen Bundestages" [@...] or substantial findings in their own right, such as the "BT Vote MP Characteristics" dataset [@...].
+The external datasets might comprise biographical data such as in the "Stammdaten des Deutschen Bundestages" [@...], substantial findings in their own right, such as the "BT Vote MP Characteristics" dataset [@...], or other structured information from knowledge bases such as Wikidata or DBpedia.
 # Linktools as a suit of three functions
@@ -28,37 +35,50 @@ This R package facilitates three steps:
 
 # The LTDataset class
 
-Textual data rarely is just a collection of tokens but in most cases enriched with metadata. Plenary speeches for example contain information about speakers or the date they were delivered. The `LTDataset` class realizes the linking of textual data and external datasets based on this metadata and essentially *joins two rectangular datasets by specific columns*. In addition, it contains some functionality to check if the data is completely linked (i.e. if the join results in missing values).
+Textual data is rarely just a collection of tokens; in most cases it is enriched with metadata. Plenary speeches, for example, contain information about speakers or the date they were delivered. The `LTDataset` class realizes the linking of textual data and external datasets based on this metadata and essentially *joins two rectangular datasets by specific columns*. In addition, it contains some functionality to check whether the data is completely linked (i.e. whether the join results in missing values) and provides possibilities to add missing information both fuzzily and manually.
 
 ## Requirements
 
-The `LTDataset` class merges the textual data and the external data by joining them based on different attributes in both datasets.
+The `LTDataset` class merges the textual data and the external data by joining them based on different attributes in both datasets. As argued previously, a robust way to link external datasets and textual data is the use of shared unique identifiers. Following this intuition, we want to add Wikidata-IDs to these speakers. As pointed out in previous considerations [@WORKINGPAPER], these are generally available for a vast number of entities, stable and extensible by users.
 
-The textual data we want to add external information to is a collection of speeches by members of parliament in the German Bundestag taken from the GermaParl corpus of parliamentary debates [@BlaetteBlessing2018]. The textual data contains several attributes which can be used for merging. To keep the examples a bit smaller, only speeches of the 13th and 14th legislative period are used.
+To realize this, the `LTDataset` class also needs information about which speaker is associated with which ID. We can use a speaker's name, the respective party affiliation and the legislative period as the left-hand side of the merge.
 
-```{r germaparl_speaker_with_metadata_sample, echo = FALSE}
-germaparl_lp13_lp14 <- subset("GERMAPARL", lp %in% c(13, 14))
+### Textual Data
+
+In this vignette, the textual data we want to add external information to is a collection of speeches by members of parliament in the German Bundestag. We use a part of the GermaParl corpus of parliamentary debates [@BlaetteBlessing2018] which is provided as sample data in the `polmineR` R package. This sample corpus, GermaParlMini, contains several attributes which can be used for merging.
+
+While in the example below the entire corpus is enriched with the external data, it is also possible to enrich only a part of the corpus (i.e. subcorpora), as sketched next.
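+
+A subcorpus created with `polmineR` can be passed to `textual_data` directly. The following minimal sketch restricts the enrichment to a single day of GermaParlMini; the chunk name is illustrative and the date value is taken from the package tests, and the chunk is not evaluated here:
+
+```{r subcorpus_sketch, eval = FALSE}
+# illustration only: enrich a single day of GermaParlMini instead of the full corpus
+gpmini_day <- corpus("GERMAPARLMINI") |>
+  polmineR::subset(date == "2009-11-12")
+```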
-germaparl_speaker_with_metadata <- s_attributes(germaparl_lp13_lp14, c("speaker", "party", "lp", "role"))
-germaparl_speaker_with_metadata_sample <- germaparl_speaker_with_metadata[role == "mp"][sample(.N, 10)]
-data.table::setorder(germaparl_speaker_with_metadata_sample, lp, speaker)
+```{r germaparl_speaker_with_metadata_sample, echo = FALSE}
+speakers_germaparlmini <- corpus("GERMAPARLMINI") |>
+  s_attributes(c("speaker", "protocol_lp", "party"))
+speakers_germaparlmini <- speakers_germaparlmini[, !c("struc", "cpos_left", "cpos_right")] |>
+  unique()
+data.table::setnames(speakers_germaparlmini, old = "value", new = "speaker")
+speakers_germaparlmini_sample <- speakers_germaparlmini[party != "NA"][sample(.N, 10)]
+data.table::setorder(speakers_germaparlmini_sample, protocol_lp, speaker)
 
-DT::datatable(germaparl_speaker_with_metadata_sample,
-              caption = "10 randomly selected members of parliament from the
-              GermaParl corpus, 13th and 14th legislative period",
+DT::datatable(speakers_germaparlmini_sample,
              caption = "10 randomly selected members of parliament from the GermaParlMini corpus",
               options = list(dom = "t"),
               rownames = FALSE)
 ```
 
-As argued previously, a robust way to link external datasets and textual data is the use of shared unique identifiers. Following this intuition, we want to add Wikidata-IDs to the these speakers. As pointed out in previous considerations [@WORKINGPAPER], these are generally available for a vast amount of entities, stable and extendible by users.
+### External Data
+
+The external data is a set of observations which contains both information that can be used to match against the metadata in the text corpus and the additional data that should be added to the corpus.
+
+In other words, some overlap between the datasets must exist to perform the matching. Often, the overlap exists naturally between different data sources - such as names or party affiliations for members of parliament - but there certainly are instances in which external datasets must be prepared beforehand to facilitate the matching.
 
-To realize this, the class also needs the information about what speaker is associated with which ID. As seen above, we can use a speaker's name, the respective party affiliation, the legislative period and the role as the left sided input of the merge. Preparing the external data in a way these two tables correspond is an upstream task in this sense that it must be prepared beforehand.
+In the following, we use data gathered from the `Stammdaten des Deutschen Bundestages` which were enriched with both Wikidata-IDs and the speakers' party affiliation specific to the individual legislative period as per Wikipedia (the Stammdaten themselves only contain a static party affiliation, i.e. the most recent party affiliation of a speaker, regardless of politicians switching parties). This data has been prepared earlier and is provided as the R data package `btmp` [@btmp_package]. The preparation of the data is documented in that package.
 
-In the following, we use data gathered from the `Stammdaten des Deutschen Bundestages` which were enriched with both Wikidata-IDs and the speakers' party affiliation specific for the individual legislative period as per Wikipedia (the Stammdaten themselves only contain static party affiliation which is the most recent party affiliation of a speaker, regardless of politicians switching parties).
+The following table shows the entries of the external dataset for those speakers sampled randomly above.
 ```{r show_external_data_min_example, echo = FALSE}
-external_data_min_example <- LinkTools::stammdaten_wikidatafied_2022_02_01_min[speaker %in% germaparl_speaker_with_metadata_sample[["speaker"]]]
-data.table::setorder(external_data_min_example, lp, speaker)
+external_data_min_example <- btmp::btmp_de |>
+  subset(legislative_period == 17) |>
+  subset(full_name %in% speakers_germaparlmini_sample[["speaker"]]) |>
+  data.table::setorder(legislative_period, full_name)
 
 DT::datatable(external_data_min_example,
               caption = "Speakers in External Data",
@@ -67,60 +87,87 @@ DT::datatable(external_data_min_example,
               rownames = FALSE)
 ```
 
-With these two resources, the task now is to merge the two datasets based on these two tables.
+*Note:* This example also illustrates that matching by identical names alone is quite challenging. Not all ten sampled speakers could be retrieved here because the names in the Stammdaten file (provided by the `btmp` package) and the names in the textual data of GermaParlMini are not always identical. The vignette shows how to perform fuzzy matching to address this problem.
 
 ## Joining Textual Data and Metadata
 
-The `LTDataset` dataset uses the information provided in the textual data and this external resource to perform matching. Aside from the name of or the path to the textual data (the `textual_data`), its type (for now: "cwb" or "xml") and the external resource (a data.table object), it must be indicated which attribute of the external resource should be added. If the name of the column of the external dataset should not be used as the name of the attribute to be added in the textual data, a named vector could be used instead. In addition, it is possible that the textual data and the external resource do not share the same column names. In this case, a named character vector is used to indicate which column of the textual data corresponds to which column of the external dataset. This might make sense to be more transparent about the variables used from the external dataset.
+With these two resources at hand, the task now is to merge the two datasets.
+
+### Instantiate the Class
+
+The `LTDataset` class uses the information provided in the textual data and this external resource to perform matching. Aside from the name of or the path to the textual data (the `textual_data`), its type (for now: "cwb") and the external resource (a data.table object), it must be indicated which attribute of the external resource should be added.
 
-A challenge is that the occurrence of the combination of a speaker's name, the party affiliation and the legislative period needs to be identified before the linkage can be performed for the entire corpus. This can result in rather large tables. Thus, merging can be done in multiple splits instead of merging the entire corpus.
+If the name of the added attribute in the final data should be different from the column name in the external data, a named character vector can be used instead. The name of the vector then represents the final name of the attribute in the enriched textual data.
+
+In addition, it is possible that the names of the attributes used for matching differ between the textual data and the external data containing the additional information. In this case, a named character vector is used to indicate which column of the textual data corresponds to which column of the external dataset, as sketched below.
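+
+As a reading aid, the two named character vectors can be sketched as follows, using only the column names of GermaParlMini and `btmp::btmp_de` that also appear in the chunk further below; the chunk itself is illustrative and not evaluated:
+
+```{r named_vector_sketch, eval = FALSE}
+# the name ("id") is chosen freely and becomes the name of the new attribute,
+# the value ("wikidata_id") is the corresponding column of the external data
+attr_to_add <- c("id" = "wikidata_id")
+
+# names refer to s-attributes of the textual data,
+# values to columns of the external data
+match_by <- c("speaker" = "full_name",
+              "party" = "party_wikipedia",
+              "protocol_lp" = "legislative_period")
+```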
+
+Since identifying all regions of text described by the combination of metadata used to match the data can result in quite large data structures, merging can be done in multiple splits, reducing the memory demand. Given the rather small size of the GERMAPARLMINI corpus, this is not done here.
 
 ```{r initialize_class}
-LTD <- LTDataset$new(textual_data = germaparl_lp13_lp14,
+LTD <- LTDataset$new(textual_data = "GERMAPARLMINI",
                      textual_data_type = "cwb",
-                     external_resource = LinkTools::stammdaten_wikidatafied_2022_02_01_min,
-                     attr_to_add = c("wikidata_id_speaker" = "wikidata_id"),
-                     match_by = c("speaker", "party", "lp", "role"),
-                     split_by = "lp",
+                     external_resource = btmp::btmp_de,
+                     attr_to_add = c("id" = "wikidata_id"),
+                     match_by = c("speaker" = "full_name",
+                                  "party" = "party_wikipedia",
+                                  "protocol_lp" = "legislative_period"),
+                     split_by = NULL,
                      verbose = TRUE,
                      forced_encoding = "UTF-8")
 ```
 
+### Join Textual and External Data
+
 After initializing the class, we can run the first method which identifies the parts of the textual data which should be linked and actually performs the linkage. Depending on the `textual_data_type`, different tasks are performed.
 
 ```{r join_textual_and_external_data}
 LTD$join_textual_and_external_data()
 ```
 
-For CWB corpora the output of this method is a region matrix and a vector of values which can be used to create structural attributes for corpora stored in the Corpus Workbench.
+For CWB corpora, the output of this method is a region matrix and a vector of values which can be used to create structural attributes for corpora stored in the CWB format.
 
-For XML, the return value is a data.table with one individual row per unique combination of attributes which is then used to append the XML files.
+For XML, the return value should be a data.table with one individual row per unique combination of attributes which is then used to append the XML files.
 
-To validate the matching, a attribute region data.table can be created.
+To validate the matching, an attribute region data.table can be created using the `create_attribute_region_datatable()` function. The function also allows defining additional attributes which were not used for the earlier verbatim matching but might be useful for both the fuzzy matching and the manual inspection. For example, specific periods of time or groups of persons which are not part of the external data and thus cannot be matched in any case can be excluded this way. We omit this step here.
 
 ```{r create_attribute_region_datatable}
 LTD$create_attribute_region_datatable()
 ```
 
+Looking at the example data, we see that not all speakers could be matched based on the speaker's name, her or his party affiliation and the legislative period. This is often due to speakers not having a party affiliation because they are not members of parliament. In GermaParlMini, the party metadata contains information about parliamentary groups, which only concern members of parliament and not other types of speakers such as presidential speakers or governmental actors. These are not part of the dataset we want to link with and thus the linking fails in these instances.
+
+For other speakers who have not been matched, it is mostly the name of the party affiliation in the textual data which differs slightly.
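+
+One way to make this visible is to list the party values of regions without a matched `id`. The following sketch is not evaluated here and assumes the attribute region data.table created above:
+
+```{r unmatched_parties_sketch, eval = FALSE}
+# party values of regions without a matched id, ignoring speakers without party
+LTD$attrs_by_region_dt[is.na(id) & party != "NA", unique(party)]
+```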
+
 ```{r print_attributes_by_region_datatable}
 head(LTD$attrs_by_region_dt)
 ```
 
+The function `check_and_add_missing_values()` provides a systematic way to inspect missing values manually and to add missing attributes. Since speakers without a party affiliation cannot be matched in any case, these observations can be excluded from the manual inspection.
+
+```{r}
+LTD$attrs_by_region_dt[party != "NA"][is.na(id)]
+```
+
+If the parameter "match_fuzzily_by" is not NULL, the attribute names provided there can be used to perform a fuzzy match. All other attributes in "match_by" are matched literally. The result of this matching is then shown in an interactive shiny session in which matches can be accepted and kept, modified, or refused and omitted.
+
+The fuzzy match is facilitated by the `fuzzy_join()` function of the package of the same name [@fuzzyjoin2020]. The main driver in the following match is the `stringdist_join()` function, to which the arguments `ignore_case`, `dist_method` and `max_dist` can be passed. The following chunk shows the chosen default values (a Levenshtein distance of at most 4, with casing being ignored).
 
-This also allows for the manual inspection of missing values as well as the manual addition of missing attributes. If the parameter "match_fuzzily_by" is not NULL, the attribute name provided there there can be used to perform a fuzzy match. All other attributes in "match_by" are still matched literally. The result of this matching is then shown in an interactive shiny session in which they can be accepted and kept or refused and omitted.
+To document the changes made manually, a log file is created in the directory defined by the `doc_dir` argument.
 
 ```{r check_and_add_missing_values, eval = interactive()}
-LTD$check_and_add_missing_values(check_for_groups = c("role" = "mp"),
+LTD$check_and_add_missing_values(check_for_groups = list("party" = "NA"),
+                                 negate = TRUE,
                                  modify = TRUE,
-                                 match_fuzzily_by = "speaker",
-                                 doc_dir = "~/lab/gitlab/tmp",
+                                 match_fuzzily_by = c("speaker", "party"),
+                                 doc_dir = "~/lab/gitlab/tmp",
+                                 ignore_case = TRUE,
+                                 dist_method = "lv",
+                                 max_dist = 4L,
                                  verbose = TRUE)
 ```
 
 Finally, the values can be added to the corpus by encoding them as a structural attribute.
 
-~~Adding additional attributes can be realized only temporarily (for experimental reasons or if only a single analysis should be performed instead of changing the corpus forever) or persistently.
-
 ```{r encode_new_s_attribute, eval = FALSE}
-LTD$encode_new_s_attribute(add_temporarily = TRUE)
-```
\ No newline at end of file
+LTD$encode_new_s_attribute()
+```
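+
+Assuming the encoding succeeded, the new attribute should afterwards appear among the structural attributes of the corpus. A minimal check, not evaluated here, could look like this:
+
+```{r check_encoding_sketch, eval = FALSE}
+# after encoding, "id" should be listed as a structural attribute
+"id" %in% s_attributes("GERMAPARLMINI")
+```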