From 2138fade56c0bca78907a405f40972d5169b43d9 Mon Sep 17 00:00:00 2001 From: ChristophLeonhardt <38076703+ChristophLeonhardt@users.noreply.github.com> Date: Mon, 1 Apr 2024 20:31:58 +0200 Subject: [PATCH] XML functionality more robust --- DESCRIPTION | 4 +- NAMESPACE | 7 +- NEWS.md | 7 ++ R/dbpedia.R | 21 +++-- R/xml.R | 171 ++++++++++++++++++++++++++++++++++------ man/get_dbpedia_uris.Rd | 3 +- man/namespaced_xpath.Rd | 2 +- man/xml_enrich.Rd | 19 ++++- 8 files changed, 196 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3e32990..86f6fd5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: dbpedia Type: Package Title: R Wrapper for DBpedia Spotlight -Version: 0.1.2.9003 -Date: 2024-03-27 +Version: 0.1.2.9004 +Date: 2024-04-01 Authors@R: c( person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")), person("Christoph", "Leonhardt", role = "aut") diff --git a/NAMESPACE b/NAMESPACE index d8eb812..369cec0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,8 +62,13 @@ importFrom(stringi,stri_c) importFrom(tibble,as_tibble) importFrom(utils,URLencode) importFrom(xml2,read_xml) +importFrom(xml2,xml_add_parent) +importFrom(xml2,xml_add_sibling) importFrom(xml2,xml_attr) importFrom(xml2,xml_children) importFrom(xml2,xml_find_all) -importFrom(xml2,xml_set_attrs) +importFrom(xml2,xml_name) +importFrom(xml2,xml_ns) +importFrom(xml2,xml_parent) +importFrom(xml2,xml_set_attr) importFrom(xml2,xml_text) diff --git a/NEWS.md b/NEWS.md index 5fe269a..22924be 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +## dbpedia v0.1.2.9004 +* improved verbosity of XML processing by implementing the approach used when processing subcorpus bundles for XML nodes as well +* addressed issue with overlapping annotations in XML, described for CWB corpora in issue #43 +* added the possibility to enrich previous named entities without restricting the initial annotation process to them only (I.e. enrich existing and adding new ones is possible now) +* changed encoding of URIs in XML from "dbpedia_uri" to "type" and "ref" +* made the annotation of nested pre-annotated data more robust + ## dbpedia v0.1.2.9003 * started introducing functionality to detect and resolve overlaps (see issue #42) with `detect_overlap()` and `categorize_overlap()` * added `resolve_overlap()` as an (experimental) function to resolve overlaps identified and categorized in `detect_overlap()` and `categorize_overlap()` diff --git a/R/dbpedia.R b/R/dbpedia.R index 7dc9bef..9be6309 100644 --- a/R/dbpedia.R +++ b/R/dbpedia.R @@ -869,7 +869,8 @@ setMethod( support = 20, expand_to_token = FALSE, drop_inexact_annotations = TRUE, - verbose = TRUE + verbose = if (progress) FALSE else verbose, + progress = FALSE ) { # sometimes, there are nodes of the same name in different parts of the @@ -900,7 +901,7 @@ setMethod( tags = segment)) } - if (verbose) cli_progress_step("preparing {.val {length(nodes_to_process)}} annotation tables.") + if (verbose) cli_progress_step("preparing {.val {length(nodes_to_process)}} segments to process.") docs <- to_annotation(nodes = nodes_to_process, xml = x, @@ -943,8 +944,15 @@ setMethod( dt } + if (progress) { + env <- parent.frame() + cli_progress_bar("Tasks", total = length(docs), type = "tasks", .envir = env) + } + annotations <- lapply(docs, function(doc) { - + + if (progress) cli_progress_update(.envir = env) + links <- get_dbpedia_uris( x = doc, language = language, @@ -970,10 +978,11 @@ setMethod( text = .SD[["text"]], types = .SD[["types"]] ), - by = "start", + by = c("start", "end"), .SDcols = c("start", "end", "dbpedia_uri", "text", "types") ] tab[, "start" := NULL] + tab[, "end" := NULL] } else { @@ -1014,7 +1023,9 @@ setMethod( } ) - + + if (progress) cli_progress_done(.envir = env) + data.table::rbindlist(annotations) }) diff --git a/R/xml.R b/R/xml.R index 0e423c3..e79f5e6 100644 --- a/R/xml.R +++ b/R/xml.R @@ -13,32 +13,56 @@ #' describing entities. #' @param feature_tag A `character vector`, the name of pre-annotated features #' to be used and enriched. -#' @param attributes A `character vector`, the names of attributes to be added. -#' Must correspond to column names in `annotation_dt`. +#' @param ref A `character vector`, the name of the URI to be added. +#' Must correspond to a column name in `annotation_dt`. +#' @param type A `character vector`, the name of the entity type to be added. +#' Must correspond to a column name in `annotation_dt`. +#' @details +#' If feature_tag is not NULL, then existing entities are enriched when they +#' entirely overlap with new annotations. In this case, found `type`s are added +#' to the node. If they are not identical, new types are added to previously +#' annotated types with a pipe, indicating different annotation results. +#' Regardless of the value in `feature_tag`, all annotations in the annotation +#' data.table are added. To limit the annotation to entities which correspond to +#' pre-annotated entities, consult the documentation of `get_dbpedia_uris()`. #' @export -#' @importFrom xml2 xml_set_attrs +#' @importFrom xml2 xml_add_parent xml_add_sibling xml_attr xml_add_parent xml_add_sibling xml_children xml_find_all xml_name xml_parent xml_set_attr xml_enrich <- function(xml, annotation_dt, token_tags = c("w", "pc"), entity_name = "name", feature_tag = NULL, - attributes = "dbpedia_uri" + ref = "dbpedia_uri", + type = NULL ) { + if (!ref %in% colnames(annotation_dt)) { + cli::cli_alert_warning(text = "{.var ref} is not in {.strong annotation data.table}. No values will be added.") + } + + if (!type %in% colnames(annotation_dt)) { + cli::cli_alert_warning(text = "{.var type} is not in {.strong annotation data.table}. No values will be added.") + } + + if (is.null(feature_tag)) { + cli::cli_alert_warning(text = "If {.var feature_tag} is NULL, it is assumed that the input data does not contain named entity annotation. If the text is pre-annotated, add the name of the tags used to encode named anntations.") + } + # get all nodes which might contain entities - nodes <- xml2::xml_find_all( + nodes <- xml_find_all( xml, xpath = namespaced_xpath(xml = xml, tags = token_tags) ) - node_ids <- xml2::xml_attr(nodes, "id") + node_ids <- xml_attr(nodes, "id") # for each annotation, extract identified words for (i in 1:nrow(annotation_dt)) { # get what to add - attributes_to_add <- sapply(attributes, function(x) annotation_dt[i, ][[x]]) + ref_to_add <- annotation_dt[i, ][[ref]] + type_to_add <- annotation_dt[i, ][[type]] # distinguish between enriched features (named entities, etc.) and enriched # tokens @@ -59,33 +83,127 @@ xml_enrich <- function(xml, for (node_idx in 1:length(entity_nodes)) { if (node_idx == 1) { - xml2::xml_add_parent(.x = entity_nodes[[node_idx]], .value = entity_name) + xml_add_parent(.x = entity_nodes[[node_idx]], .value = entity_name) } else { - xml2::xml_add_sibling(.x = entity_nodes[[1]], .value = entity_nodes[[node_idx]], .copy = FALSE) + xml_add_sibling(.x = entity_nodes[[1]], .value = entity_nodes[[node_idx]], .copy = FALSE) } } # and set attributes to this new parent node - xml_set_attrs(xml2::xml_parent(entity_nodes), attributes_to_add) + xml_set_attr(xml_parent(entity_nodes), attr = "type", value = type_to_add) + xml_set_attr(xml_parent(entity_nodes), attr = "ref", value = ref_to_add) } else { # else there is a pre-annotated feature such as named entities - # the ID in the annotation data then points to the first word of the - # entity, with a trailing "_FEATURE-TAG" behind the ID itself. + # there are two scenarios: + # 1: feature tags were used when getting DBpedia URIs previously + # 2: feature tags were not used before but now, should be considered when adding tags - annotation_id <- annotation_dt[i, ][["original_id"]] + # In scenario 1, the ID in the annotation data then points to the first + #word of the entity, with a trailing "_FEATURE-TAG" behind the ID itself. + annotation_id <- annotation_dt[i, ][["original_id"]] feature_tag_end <- paste0("_", feature_tag, "$") - feature_word_id <- gsub(feature_tag_end, "", annotation_id) - # get first word node - nodes_idx <- which(node_ids %in% feature_word_id) - feature_node <- xml2::xml_parent(nodes[nodes_idx]) + if (grepl(pattern = feature_tag_end, x = annotation_id)) { + feature_word_id <- gsub(feature_tag_end, "", annotation_id) + + # get first word node + nodes_idx <- which(node_ids %in% feature_word_id) + feature_node <- xml_parent(nodes[nodes_idx]) + + # check if there is already a type in the node + original_type <- xml_attr(x = feature_node, attr = "type") + type_to_add <- paste0(unique(c(type_to_add, original_type)), collapse = "|") + + xml_set_attr(feature_node, attr = "type", value = type_to_add) + xml_set_attr(feature_node, attr = "ref", value = ref_to_add) + + } else { + + # In scenario 2, all tokens are considered to link. Now it is possible + # that there already are named entities annotated. If the new annotation + # and the existing NE annotation completely overlap, the existing named + # entity should be enriched. + + anno_token_ids <- unlist(strsplit(annotation_id, split = "\\|")) + nodes_idx <- which(node_ids %in% anno_token_ids) + feature_node <- xml_parent(nodes[nodes_idx]) - xml2::xml_set_attrs(x = feature_node, value = attributes_to_add) + # if they all have the same parent, add attributes to this parent + # (the check whether all children are in the new annotation is important + # for nested entities). + if (length(feature_node) == 1 & all(xml_attr(xml_children(feature_node), "id") %in% anno_token_ids)) { + + # check if there is already a type in the node + original_type <- xml_attr(x = feature_node, attr = "type") + type_to_add <- paste0(unique(c(original_type, type_to_add)), collapse = "|") + + xml_set_attr(feature_node, attr = "type", value = type_to_add) + xml_set_attr(feature_node, attr = "ref", value = ref_to_add) + + } else { + # else add a new parent node + entity_nodes <- nodes[nodes_idx] + + # here, it is possible that some of the children already are + # nodes. If so, add all nodes with their parent. + + name_nodes_idx <- which(sapply(entity_nodes, function(x) xml_name(xml_parent(x))) == "name") + + original_length <- length(xml_parent(entity_nodes)) + + for (node_idx in seq_along(entity_nodes)) { + + # if there is a name node and if there is more than one parent + # (otherwise, it is possible that all nodes are nested in one single + # name parent which should be kept). + + if (node_idx %in% name_nodes_idx & original_length > 1L) { + node_to_add <- xml_parent(entity_nodes[[node_idx]]) + } else { + node_to_add <- entity_nodes[[node_idx]] + } + + if (node_idx == 1) { + xml_add_parent(.x = node_to_add, .value = entity_name) + first_sibling_node <- node_to_add + } else { + # check if the nodes are already added due to nested annotations earlier. + ids_new <- if (length(xml_children(node_to_add)) > 0) { + xml_attr(xml_children(node_to_add), "id") + } else { + xml_attr(node_to_add, "id") + } + ids_old <- xml_attr(xml_find_all( + x = xml_parent(first_sibling_node), + namespaced_xpath(xml = xml, tags = token_tags) + ), "id") + if (!all(ids_new %in% ids_old)) { + if (length(xml_children(node_to_add)) > 1L) { + cli::cli_alert_warning( + text = "Pre-Annotated Named Entities and added entities overlap. Adding entire `` node to the new annotation. This can result in inprecise presentation of entity spans." + ) + } + xml_add_sibling(.x = first_sibling_node, .value = node_to_add, .copy = FALSE) + } + } + } + + # check if there is already a type in the node + original_type <- xml_attr(x = xml_parent(first_sibling_node), attr = "type") + + if (!is.na(original_type)) { + type_to_add <- paste0(unique(c(type_to_add, original_type)), collapse = "|") + } + + xml_set_attr(xml_parent(first_sibling_node), attr = "type", value = type_to_add) + xml_set_attr(xml_parent(first_sibling_node), attr = "ref", value = ref_to_add) + } + } } } } @@ -95,26 +213,29 @@ xml_enrich <- function(xml, #' function which adds the namespace to the XPATH #' #' @param xml xml the namespace is derived from -#' @param tags the tags which should be queried with the XPATH# +#' @param tags the tags which should be queried with the XPATH +#' @importFrom xml2 xml_ns #' @export namespaced_xpath <- function(xml = xml, tags) { - - xml_namespace <- xml2::xml_ns(xml) - + + xml_namespace <- xml_ns(xml) + if (length(xml_namespace) == 1) { + namespace_nm <- names(xml_namespace) + } else { - + tei_ns_idx <- grep(pattern = "http://www.tei-c.org/ns/1.0", x = xml_namespace) - + if (tei_ns_idx == 1) { namespace_nm <- names(xml_namespace[tei_ns_idx]) } else { stop("Unspecified Namespace") } } - + if (length(tags) == 1) { sprintf(".//%s:%s", namespace_nm, tags) } else { diff --git a/man/get_dbpedia_uris.Rd b/man/get_dbpedia_uris.Rd index 9b0d223..df8eb08 100644 --- a/man/get_dbpedia_uris.Rd +++ b/man/get_dbpedia_uris.Rd @@ -92,7 +92,8 @@ get_dbpedia_uris(x, ...) support = 20, expand_to_token = FALSE, drop_inexact_annotations = TRUE, - verbose = TRUE + verbose = if (progress) FALSE else verbose, + progress = FALSE ) } \arguments{ diff --git a/man/namespaced_xpath.Rd b/man/namespaced_xpath.Rd index 21a3254..7dc25c4 100644 --- a/man/namespaced_xpath.Rd +++ b/man/namespaced_xpath.Rd @@ -9,7 +9,7 @@ namespaced_xpath(xml = xml, tags) \arguments{ \item{xml}{xml the namespace is derived from} -\item{tags}{the tags which should be queried with the XPATH#} +\item{tags}{the tags which should be queried with the XPATH} } \description{ function which adds the namespace to the XPATH diff --git a/man/xml_enrich.Rd b/man/xml_enrich.Rd index 40312aa..5aa1389 100644 --- a/man/xml_enrich.Rd +++ b/man/xml_enrich.Rd @@ -10,7 +10,8 @@ xml_enrich( token_tags = c("w", "pc"), entity_name = "name", feature_tag = NULL, - attributes = "dbpedia_uri" + ref = "dbpedia_uri", + type = NULL ) } \arguments{ @@ -29,10 +30,22 @@ describing entities.} \item{feature_tag}{A \verb{character vector}, the name of pre-annotated features to be used and enriched.} -\item{attributes}{A \verb{character vector}, the names of attributes to be added. -Must correspond to column names in \code{annotation_dt}.} +\item{ref}{A \verb{character vector}, the name of the URI to be added. +Must correspond to a column name in \code{annotation_dt}.} + +\item{type}{A \verb{character vector}, the name of the entity type to be added. +Must correspond to a column name in \code{annotation_dt}.} } \description{ Enrich original XML with retrieved Uniform Resource Identifiers and additional information. } +\details{ +If feature_tag is not NULL, then existing entities are enriched when they +entirely overlap with new annotations. In this case, found \code{type}s are added +to the node. If they are not identical, new types are added to previously +annotated types with a pipe, indicating different annotation results. +Regardless of the value in \code{feature_tag}, all annotations in the annotation +data.table are added. To limit the annotation to entities which correspond to +pre-annotated entities, consult the documentation of \code{get_dbpedia_uris()}. +}