xml_enrich works for pre-annotated data and minor adjustments

PolMine · Feb 21, 2024 · 128cc90 · 128cc90
1 parent 7bea856
commit 128cc90
Show file tree

Hide file tree

Showing 8 changed files with 125 additions and 71 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: dbpedia
 Type: Package
 Title: R Wrapper for DBpedia Spotlight
-Version: 0.1.1.9010
-Date: 2024-02-16
+Version: 0.1.1.9011
+Date: 2024-02-21
 Authors@R: c(
   person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
   person("Christoph", "Leonhardt", role = "aut")

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+## dbpedia v0.1.1.9011
+* `xml_enrich()` now adds new attributes to pre-annotated features
+* `get_dbpedia_uris()` method now includes argument `expand_to_token` for subcorpus_bundles as well
+* `map_types_to_class()` works with the list representation in the types column
+
 ## dbpedia v0.1.1.9010
 * new functions `to_annotation()`, `xml_enrich()` `namespaced_xpath()` and method `get_dbpedia_uris()` for xml docs.
 

diff --git a/R/dbpedia.R b/R/dbpedia.R
@@ -566,7 +566,7 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp
 #' uritab <- corpus("REUTERS") %>% 
 #'   split(s_attribute = "id", verbose = FALSE) %>% 
 #'   get_dbpedia_uris(language = "en", p_attribute = "word", verbose = TRUE)
-setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, confidence = 0.35, api = getOption("dbpedia.endpoint"), max_len = 5600L, verbose = TRUE, progress = FALSE){
+setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, confidence = 0.35, api = getOption("dbpedia.endpoint"), max_len = 5600L, expand_to_token = FALSE, verbose = TRUE, progress = FALSE){
 
   if (progress){
     env <- parent.frame()
@@ -584,6 +584,7 @@ setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOpti
         max_len = max_len,
         confidence = confidence,
         api = api,
+        expand_to_token = expand_to_token,
         verbose = if (progress) FALSE else verbose
       )
     }

diff --git a/R/utils.R b/R/utils.R
@@ -171,14 +171,23 @@ unique_msg <- function(x, verbose = TRUE){
 
 #' Map types returned by DBpedia Spotlight to a limited set of classes
 #'
+#' This function takes the output of `get_dbpedia_uris()` and compares values in
+#' the `types` column with a named character vector. The main purpose of this
+#' function is to reduce the number of types to a limited set of classes.
+#'
 #' @param x A `data.table` with DBpedia URIs.
-#' @param mapping_vector A `named character vector` with desired class names
-#'     (as names) and types from the DBpedia ontology as values.
-#' @param other a `character vector` with the name of the class of all types
-#'     not matched by the `mapping_vector`.
+#' @param mapping_vector A `named character vector` with desired class names (as
+#'   names) and types from the DBpedia ontology as values. For example:
+#'   c("PERSON" = "DBpedia:Person"). Can contain more than one pair of class and
+#'   type.
+#' @param other a `character vector` with the name of the class of all types not
+#'   matched by the `mapping_vector`.
 #' @param verbose A `logical` value - whether to display messages.
 #' @importFrom data.table is.data.table
 #' @importFrom cli format_error cli_alert_info
+#' @details If there is more than one match between the retrieved types and the
+#'   `mapping vector`, unique classes are sorted alphabetically and collapsed.
+#' @return Function adds classes to input data.table by reference.
 #' @export
 map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE) {
 
@@ -220,8 +229,14 @@ map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE
 
   types_to_class_fun <- function(types) {
 
-    types_with_class <- types |>
-      strsplit(split = ",") |>
+    # types is a list of lists. Transform to single character vector.
+    type_list <- unlist(types, recursive = FALSE)
+
+    types_with_class <- lapply(seq_along(type_list), function(i) {
+      list_name <- names(type_list)[[i]]
+      list_elements <- type_list[[i]]
+      paste0(list_name, ":", list_elements)
+    }) |>
       unlist() |>
       intersect(mapping_vector)
 
@@ -232,6 +247,7 @@ map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE
         names() |>
         _[match_idx] |>
         unique() |>
+        sort() |>
         paste(collapse = "|")
 
     } else {

diff --git a/R/xml.R b/R/xml.R
@@ -1,75 +1,88 @@
 #' Add entities to original XML
-#' 
-#' @param xml ...
-#' @param annotation_dt ...
-#' @param entity_tags ...
-#' @param entity_name ...
-#' @param token_tags ...
-#' @param attributes ...
+#'
+#' Enrich original XML with retrieved Uniform Resource Identifiers and
+#' additional information.
+#'
+#' @param xml The XML document to be modified in place.
+#' @param annotation_dt A `data.frame` or `data.table` of annotations returned
+#'   by `get_dbpedia_uris()`. Must contain token IDs and values to be added
+#'   (e.g. URIs).
+#' @param token_tags A `character vector`, the names of XML nodes containing
+#'   tokens, i.e. which potentially can be (part of) entities.
+#' @param entity_name A `character vector`, the name of added XML nodes
+#'   describing entities.
+#' @param feature_tag A `character vector`, the name of pre-annotated features
+#'   to be used and enriched.
+#' @param attributes A `character vector`, the names of attributes to be added.
+#'   Must correspond to column names in `annotation_dt`.
 #' @export
 #' @importFrom xml2 xml_set_attrs
 xml_enrich <- function(xml,
                        annotation_dt,
-                       entity_tags = c("w", "pc", "name"),
-                       entity_name = "name",
                        token_tags = c("w", "pc"),
+                       entity_name = "name",
+                       feature_tag = NULL,
                        attributes = "dbpedia_uri"
 ) {
-  
+
   # get all nodes which might contain entities
   nodes <- xml |>
-    xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = entity_tags))
-  
+    xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = token_tags))
+
   node_ids <- nodes |>
     xml2::xml_attr("id")
-  
+
   # for each annotation, extract identified words 
-  
+
   for (i in 1:nrow(annotation_dt)) {
-
-    # in theory, an annotation can comprise more than one word
-    annotation_id <- annotation_dt[i, ][["original_id"]] |>
-      strsplit(split = "\\|") |>
-      unlist()
-
+
     # get what to add
     attributes_to_add <- sapply(attributes, function(x) annotation_dt[i, ][[x]])
-
-    # there could be additional values such as the type?
-    nodes_idx <- which(node_ids %in% annotation_id)
-
-    entity_nodes <- nodes[nodes_idx]
-
-    # this assumes that an entity always describes continuous spans which is
-    # plausible.
-
-    # check if nodes are words (or punctuation) or already something like an
-    # entity. If it is already an entity, an attribute should be added. If not,
-    # a new parent node should be added.
-
-    if (all(xml2::xml_name(entity_nodes) %in% token_tags)) {
-
+
+    # distinguish between enriched features (named entities, etc.) and enriched
+    # tokens
+
+    if (is.null(feature_tag)) {
+      # if there is no feature tag, pre-annotated named entities weren't
+      # provided. Add identified named entities to tokens.
+
+      annotation_id <- annotation_dt[i, ][["original_id"]] |>
+        strsplit(split = "\\|") |>
+        unlist()
+
+      # there could be additional values such as the type?
+      nodes_idx <- which(node_ids %in% annotation_id)
+      entity_nodes <- nodes[nodes_idx]
+
       for (node_idx in 1:length(entity_nodes)) {
         if (node_idx == 1) {
           xml2::xml_add_parent(.x = entity_nodes[[node_idx]], .value = entity_name)
         } else {
           xml2::xml_add_sibling(.x = entity_nodes[[1]], .value = entity_nodes[[node_idx]], .copy  = FALSE)
         }
       }
-      
+
       # and set attributes to this new parent node
       xml_set_attrs(xml2::xml_parent(entity_nodes), attributes_to_add)
-
-    } else if (entity_nodes == entity_name) { 
-
-      # this assumes that the new nodes should be the same name as the old
-      # nodes. Are there scenarios in which this does not hold true?
-
-      # if it is a named entity already, set attribute here
-      # Does this work if there are multiple nodes here?
-
-      xml_set_attrs(entity_nodes, attributes_to_add)
-
+
+    } else {
+
+      # else there is a pre-annotated feature such as named entities
+
+      # the ID in the annotation data then points to the first word of the
+      # entity, with a trailing "_FEATURE-TAG" behind the ID itself.
+
+      annotation_id <- annotation_dt[i, ][["original_id"]]
+
+      feature_tag_end <- paste0("_", feature_tag, "$")
+      feature_word_id <- gsub(feature_tag_end, "", annotation_id)
+
+      # get first word node
+      nodes_idx <- which(node_ids %in% feature_word_id)
+      feature_node <- xml2::xml_parent(nodes[nodes_idx])
+
+      xml2::xml_set_attrs(x = feature_node, value = attributes_to_add)
+
     }
   }
 }
@@ -81,7 +94,7 @@ xml_enrich <- function(xml,
 #' @param xml xml the namespace is derived from
 #' @param tags the tags which should be queried with the XPATH#
 #' @export
-namespaced_xpath = function(xml = xml, tags) {
+namespaced_xpath <- function(xml = xml, tags) {
 
   xml_namespace <- xml2::xml_ns(xml)
 

diff --git a/man/get_dbpedia_uris.Rd b/man/get_dbpedia_uris.Rd
diff --git a/man/map_types_to_class.Rd b/man/map_types_to_class.Rd
diff --git a/man/xml_enrich.Rd b/man/xml_enrich.Rd