Skip to content

Commit

Permalink
XML functionality more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristophLeonhardt committed Apr 1, 2024
1 parent 0768d91 commit 2138fad
Show file tree
Hide file tree
Showing 8 changed files with 196 additions and 38 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: dbpedia
Type: Package
Title: R Wrapper for DBpedia Spotlight
Version: 0.1.2.9003
Date: 2024-03-27
Version: 0.1.2.9004
Date: 2024-04-01
Authors@R: c(
person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
person("Christoph", "Leonhardt", role = "aut")
Expand Down
7 changes: 6 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,13 @@ importFrom(stringi,stri_c)
importFrom(tibble,as_tibble)
importFrom(utils,URLencode)
importFrom(xml2,read_xml)
importFrom(xml2,xml_add_parent)
importFrom(xml2,xml_add_sibling)
importFrom(xml2,xml_attr)
importFrom(xml2,xml_children)
importFrom(xml2,xml_find_all)
importFrom(xml2,xml_set_attrs)
importFrom(xml2,xml_name)
importFrom(xml2,xml_ns)
importFrom(xml2,xml_parent)
importFrom(xml2,xml_set_attr)
importFrom(xml2,xml_text)
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## dbpedia v0.1.2.9004
* improved verbosity of XML processing by implementing the approach used when processing subcorpus bundles for XML nodes as well
* addressed issue with overlapping annotations in XML, described for CWB corpora in issue #43
* added the possibility to enrich previous named entities without restricting the initial annotation process to them only (I.e. enrich existing and adding new ones is possible now)
* changed encoding of URIs in XML from "dbpedia_uri" to "type" and "ref"
* made the annotation of nested pre-annotated data more robust

## dbpedia v0.1.2.9003
* started introducing functionality to detect and resolve overlaps (see issue #42) with `detect_overlap()` and `categorize_overlap()`
* added `resolve_overlap()` as an (experimental) function to resolve overlaps identified and categorized in `detect_overlap()` and `categorize_overlap()`
Expand Down
21 changes: 16 additions & 5 deletions R/dbpedia.R
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,8 @@ setMethod(
support = 20,
expand_to_token = FALSE,
drop_inexact_annotations = TRUE,
verbose = TRUE
verbose = if (progress) FALSE else verbose,
progress = FALSE
) {

# sometimes, there are nodes of the same name in different parts of the
Expand Down Expand Up @@ -900,7 +901,7 @@ setMethod(
tags = segment))
}

if (verbose) cli_progress_step("preparing {.val {length(nodes_to_process)}} annotation tables.")
if (verbose) cli_progress_step("preparing {.val {length(nodes_to_process)}} segments to process.")

docs <- to_annotation(nodes = nodes_to_process,
xml = x,
Expand Down Expand Up @@ -943,8 +944,15 @@ setMethod(
dt
}

if (progress) {
env <- parent.frame()
cli_progress_bar("Tasks", total = length(docs), type = "tasks", .envir = env)
}

annotations <- lapply(docs, function(doc) {


if (progress) cli_progress_update(.envir = env)

links <- get_dbpedia_uris(
x = doc,
language = language,
Expand All @@ -970,10 +978,11 @@ setMethod(
text = .SD[["text"]],
types = .SD[["types"]]
),
by = "start",
by = c("start", "end"),
.SDcols = c("start", "end", "dbpedia_uri", "text", "types")
]
tab[, "start" := NULL]
tab[, "end" := NULL]

} else {

Expand Down Expand Up @@ -1014,7 +1023,9 @@ setMethod(

}
)


if (progress) cli_progress_done(.envir = env)

data.table::rbindlist(annotations)
})

Expand Down
171 changes: 146 additions & 25 deletions R/xml.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,32 +13,56 @@
#' describing entities.
#' @param feature_tag A `character vector`, the name of pre-annotated features
#' to be used and enriched.
#' @param attributes A `character vector`, the names of attributes to be added.
#' Must correspond to column names in `annotation_dt`.
#' @param ref A `character vector`, the name of the URI to be added.
#' Must correspond to a column name in `annotation_dt`.
#' @param type A `character vector`, the name of the entity type to be added.
#' Must correspond to a column name in `annotation_dt`.
#' @details
#' If feature_tag is not NULL, then existing entities are enriched when they
#' entirely overlap with new annotations. In this case, found `type`s are added
#' to the node. If they are not identical, new types are added to previously
#' annotated types with a pipe, indicating different annotation results.
#' Regardless of the value in `feature_tag`, all annotations in the annotation
#' data.table are added. To limit the annotation to entities which correspond to
#' pre-annotated entities, consult the documentation of `get_dbpedia_uris()`.
#' @export
#' @importFrom xml2 xml_set_attrs
#' @importFrom xml2 xml_add_parent xml_add_sibling xml_attr xml_add_parent xml_add_sibling xml_children xml_find_all xml_name xml_parent xml_set_attr
xml_enrich <- function(xml,
annotation_dt,
token_tags = c("w", "pc"),
entity_name = "name",
feature_tag = NULL,
attributes = "dbpedia_uri"
ref = "dbpedia_uri",
type = NULL
) {

if (!ref %in% colnames(annotation_dt)) {
cli::cli_alert_warning(text = "{.var ref} is not in {.strong annotation data.table}. No values will be added.")
}

if (!type %in% colnames(annotation_dt)) {
cli::cli_alert_warning(text = "{.var type} is not in {.strong annotation data.table}. No values will be added.")
}

if (is.null(feature_tag)) {
cli::cli_alert_warning(text = "If {.var feature_tag} is NULL, it is assumed that the input data does not contain named entity annotation. If the text is pre-annotated, add the name of the tags used to encode named anntations.")
}

# get all nodes which might contain entities
nodes <- xml2::xml_find_all(
nodes <- xml_find_all(
xml,
xpath = namespaced_xpath(xml = xml, tags = token_tags)
)

node_ids <- xml2::xml_attr(nodes, "id")
node_ids <- xml_attr(nodes, "id")

# for each annotation, extract identified words

for (i in 1:nrow(annotation_dt)) {

# get what to add
attributes_to_add <- sapply(attributes, function(x) annotation_dt[i, ][[x]])
ref_to_add <- annotation_dt[i, ][[ref]]
type_to_add <- annotation_dt[i, ][[type]]

# distinguish between enriched features (named entities, etc.) and enriched
# tokens
Expand All @@ -59,33 +83,127 @@ xml_enrich <- function(xml,

for (node_idx in 1:length(entity_nodes)) {
if (node_idx == 1) {
xml2::xml_add_parent(.x = entity_nodes[[node_idx]], .value = entity_name)
xml_add_parent(.x = entity_nodes[[node_idx]], .value = entity_name)
} else {
xml2::xml_add_sibling(.x = entity_nodes[[1]], .value = entity_nodes[[node_idx]], .copy = FALSE)
xml_add_sibling(.x = entity_nodes[[1]], .value = entity_nodes[[node_idx]], .copy = FALSE)
}
}

# and set attributes to this new parent node
xml_set_attrs(xml2::xml_parent(entity_nodes), attributes_to_add)
xml_set_attr(xml_parent(entity_nodes), attr = "type", value = type_to_add)
xml_set_attr(xml_parent(entity_nodes), attr = "ref", value = ref_to_add)

} else {

# else there is a pre-annotated feature such as named entities

# the ID in the annotation data then points to the first word of the
# entity, with a trailing "_FEATURE-TAG" behind the ID itself.
# there are two scenarios:
# 1: feature tags were used when getting DBpedia URIs previously
# 2: feature tags were not used before but now, should be considered when adding tags

annotation_id <- annotation_dt[i, ][["original_id"]]
# In scenario 1, the ID in the annotation data then points to the first
#word of the entity, with a trailing "_FEATURE-TAG" behind the ID itself.

annotation_id <- annotation_dt[i, ][["original_id"]]
feature_tag_end <- paste0("_", feature_tag, "$")
feature_word_id <- gsub(feature_tag_end, "", annotation_id)

# get first word node
nodes_idx <- which(node_ids %in% feature_word_id)
feature_node <- xml2::xml_parent(nodes[nodes_idx])
if (grepl(pattern = feature_tag_end, x = annotation_id)) {
feature_word_id <- gsub(feature_tag_end, "", annotation_id)

# get first word node
nodes_idx <- which(node_ids %in% feature_word_id)
feature_node <- xml_parent(nodes[nodes_idx])

# check if there is already a type in the node
original_type <- xml_attr(x = feature_node, attr = "type")
type_to_add <- paste0(unique(c(type_to_add, original_type)), collapse = "|")

xml_set_attr(feature_node, attr = "type", value = type_to_add)
xml_set_attr(feature_node, attr = "ref", value = ref_to_add)

} else {

# In scenario 2, all tokens are considered to link. Now it is possible
# that there already are named entities annotated. If the new annotation
# and the existing NE annotation completely overlap, the existing named
# entity should be enriched.

anno_token_ids <- unlist(strsplit(annotation_id, split = "\\|"))
nodes_idx <- which(node_ids %in% anno_token_ids)
feature_node <- xml_parent(nodes[nodes_idx])

xml2::xml_set_attrs(x = feature_node, value = attributes_to_add)
# if they all have the same parent, add attributes to this parent
# (the check whether all children are in the new annotation is important
# for nested entities).

if (length(feature_node) == 1 & all(xml_attr(xml_children(feature_node), "id") %in% anno_token_ids)) {

# check if there is already a type in the node
original_type <- xml_attr(x = feature_node, attr = "type")
type_to_add <- paste0(unique(c(original_type, type_to_add)), collapse = "|")

xml_set_attr(feature_node, attr = "type", value = type_to_add)
xml_set_attr(feature_node, attr = "ref", value = ref_to_add)

} else {
# else add a new parent node
entity_nodes <- nodes[nodes_idx]

# here, it is possible that some of the children already are <name>
# nodes. If so, add all nodes with their <name> parent.

name_nodes_idx <- which(sapply(entity_nodes, function(x) xml_name(xml_parent(x))) == "name")

original_length <- length(xml_parent(entity_nodes))

for (node_idx in seq_along(entity_nodes)) {

# if there is a name node and if there is more than one parent
# (otherwise, it is possible that all nodes are nested in one single
# name parent which should be kept).

if (node_idx %in% name_nodes_idx & original_length > 1L) {
node_to_add <- xml_parent(entity_nodes[[node_idx]])
} else {
node_to_add <- entity_nodes[[node_idx]]
}

if (node_idx == 1) {
xml_add_parent(.x = node_to_add, .value = entity_name)
first_sibling_node <- node_to_add
} else {
# check if the nodes are already added due to nested annotations earlier.
ids_new <- if (length(xml_children(node_to_add)) > 0) {
xml_attr(xml_children(node_to_add), "id")
} else {
xml_attr(node_to_add, "id")
}
ids_old <- xml_attr(xml_find_all(
x = xml_parent(first_sibling_node),
namespaced_xpath(xml = xml, tags = token_tags)
), "id")
if (!all(ids_new %in% ids_old)) {
if (length(xml_children(node_to_add)) > 1L) {
cli::cli_alert_warning(
text = "Pre-Annotated Named Entities and added entities overlap. Adding entire `<name>` node to the new annotation. This can result in inprecise presentation of entity spans."
)
}
xml_add_sibling(.x = first_sibling_node, .value = node_to_add, .copy = FALSE)
}
}
}

# check if there is already a type in the node
original_type <- xml_attr(x = xml_parent(first_sibling_node), attr = "type")

if (!is.na(original_type)) {
type_to_add <- paste0(unique(c(type_to_add, original_type)), collapse = "|")
}

xml_set_attr(xml_parent(first_sibling_node), attr = "type", value = type_to_add)
xml_set_attr(xml_parent(first_sibling_node), attr = "ref", value = ref_to_add)
}
}
}
}
}
Expand All @@ -95,26 +213,29 @@ xml_enrich <- function(xml,
#' function which adds the namespace to the XPATH
#'
#' @param xml xml the namespace is derived from
#' @param tags the tags which should be queried with the XPATH#
#' @param tags the tags which should be queried with the XPATH
#' @importFrom xml2 xml_ns
#' @export
namespaced_xpath <- function(xml = xml, tags) {
xml_namespace <- xml2::xml_ns(xml)

xml_namespace <- xml_ns(xml)

if (length(xml_namespace) == 1) {

namespace_nm <- names(xml_namespace)

} else {

tei_ns_idx <- grep(pattern = "http://www.tei-c.org/ns/1.0",
x = xml_namespace)

if (tei_ns_idx == 1) {
namespace_nm <- names(xml_namespace[tei_ns_idx])
} else {
stop("Unspecified Namespace")
}
}

if (length(tags) == 1) {
sprintf(".//%s:%s", namespace_nm, tags)
} else {
Expand Down
3 changes: 2 additions & 1 deletion man/get_dbpedia_uris.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/namespaced_xpath.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 16 additions & 3 deletions man/xml_enrich.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 2138fad

Please sign in to comment.