Skip to content

Commit

Permalink
xml_enrich works for pre-annotated data and minor adjustments
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristophLeonhardt committed Feb 21, 2024
1 parent 7bea856 commit 128cc90
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 71 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: dbpedia
Type: Package
Title: R Wrapper for DBpedia Spotlight
Version: 0.1.1.9010
Date: 2024-02-16
Version: 0.1.1.9011
Date: 2024-02-21
Authors@R: c(
person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
person("Christoph", "Leonhardt", role = "aut")
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## dbpedia v0.1.1.9011
* `xml_enrich()` now adds new attributes to pre-annotated features
* `get_dbpedia_uris()` method now includes argument `expand_to_token` for subcorpus_bundles as well
* `map_types_to_class()` works with the list representation in the types column

## dbpedia v0.1.1.9010
* new functions `to_annotation()`, `xml_enrich()` `namespaced_xpath()` and method `get_dbpedia_uris()` for xml docs.

Expand Down
3 changes: 2 additions & 1 deletion R/dbpedia.R
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp
#' uritab <- corpus("REUTERS") %>%
#' split(s_attribute = "id", verbose = FALSE) %>%
#' get_dbpedia_uris(language = "en", p_attribute = "word", verbose = TRUE)
setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, confidence = 0.35, api = getOption("dbpedia.endpoint"), max_len = 5600L, verbose = TRUE, progress = FALSE){
setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, confidence = 0.35, api = getOption("dbpedia.endpoint"), max_len = 5600L, expand_to_token = FALSE, verbose = TRUE, progress = FALSE){

if (progress){
env <- parent.frame()
Expand All @@ -584,6 +584,7 @@ setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOpti
max_len = max_len,
confidence = confidence,
api = api,
expand_to_token = expand_to_token,
verbose = if (progress) FALSE else verbose
)
}
Expand Down
28 changes: 22 additions & 6 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -171,14 +171,23 @@ unique_msg <- function(x, verbose = TRUE){

#' Map types returned by DBpedia Spotlight to a limited set of classes
#'
#' This function takes the output of `get_dbpedia_uris()` and compares values in
#' the `types` column with a named character vector. The main purpose of this
#' function is to reduce the number of types to a limited set of classes.
#'
#' @param x A `data.table` with DBpedia URIs.
#' @param mapping_vector A `named character vector` with desired class names
#' (as names) and types from the DBpedia ontology as values.
#' @param other a `character vector` with the name of the class of all types
#' not matched by the `mapping_vector`.
#' @param mapping_vector A `named character vector` with desired class names (as
#' names) and types from the DBpedia ontology as values. For example:
#' c("PERSON" = "DBpedia:Person"). Can contain more than one pair of class and
#' type.
#' @param other a `character vector` with the name of the class of all types not
#' matched by the `mapping_vector`.
#' @param verbose A `logical` value - whether to display messages.
#' @importFrom data.table is.data.table
#' @importFrom cli format_error cli_alert_info
#' @details If there is more than one match between the retrieved types and the
#' `mapping vector`, unique classes are sorted alphabetically and collapsed.
#' @return Function adds classes to input data.table by reference.
#' @export
map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE) {

Expand Down Expand Up @@ -220,8 +229,14 @@ map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE

types_to_class_fun <- function(types) {

types_with_class <- types |>
strsplit(split = ",") |>
# types is a list of lists. Transform to single character vector.
type_list <- unlist(types, recursive = FALSE)

types_with_class <- lapply(seq_along(type_list), function(i) {
list_name <- names(type_list)[[i]]
list_elements <- type_list[[i]]
paste0(list_name, ":", list_elements)
}) |>
unlist() |>
intersect(mapping_vector)

Expand All @@ -232,6 +247,7 @@ map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE
names() |>
_[match_idx] |>
unique() |>
sort() |>
paste(collapse = "|")

} else {
Expand Down
109 changes: 61 additions & 48 deletions R/xml.R
Original file line number Diff line number Diff line change
@@ -1,75 +1,88 @@
#' Add entities to original XML
#'
#' @param xml ...
#' @param annotation_dt ...
#' @param entity_tags ...
#' @param entity_name ...
#' @param token_tags ...
#' @param attributes ...
#'
#' Enrich original XML with retrieved Uniform Resource Identifiers and
#' additional information.
#'
#' @param xml The XML document to be modified in place.
#' @param annotation_dt A `data.frame` or `data.table` of annotations returned
#' by `get_dbpedia_uris()`. Must contain token IDs and values to be added
#' (e.g. URIs).
#' @param token_tags A `character vector`, the names of XML nodes containing
#' tokens, i.e. which potentially can be (part of) entities.
#' @param entity_name A `character vector`, the name of added XML nodes
#' describing entities.
#' @param feature_tag A `character vector`, the name of pre-annotated features
#' to be used and enriched.
#' @param attributes A `character vector`, the names of attributes to be added.
#' Must correspond to column names in `annotation_dt`.
#' @export
#' @importFrom xml2 xml_set_attrs
xml_enrich <- function(xml,
annotation_dt,
entity_tags = c("w", "pc", "name"),
entity_name = "name",
token_tags = c("w", "pc"),
entity_name = "name",
feature_tag = NULL,
attributes = "dbpedia_uri"
) {

# get all nodes which might contain entities
nodes <- xml |>
xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = entity_tags))
xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = token_tags))

node_ids <- nodes |>
xml2::xml_attr("id")

# for each annotation, extract identified words

for (i in 1:nrow(annotation_dt)) {

# in theory, an annotation can comprise more than one word
annotation_id <- annotation_dt[i, ][["original_id"]] |>
strsplit(split = "\\|") |>
unlist()


# get what to add
attributes_to_add <- sapply(attributes, function(x) annotation_dt[i, ][[x]])

# there could be additional values such as the type?
nodes_idx <- which(node_ids %in% annotation_id)

entity_nodes <- nodes[nodes_idx]

# this assumes that an entity always describes continuous spans which is
# plausible.

# check if nodes are words (or punctuation) or already something like an
# entity. If it is already an entity, an attribute should be added. If not,
# a new parent node should be added.

if (all(xml2::xml_name(entity_nodes) %in% token_tags)) {


# distinguish between enriched features (named entities, etc.) and enriched
# tokens

if (is.null(feature_tag)) {
# if there is no feature tag, pre-annotated named entities weren't
# provided. Add identified named entities to tokens.

annotation_id <- annotation_dt[i, ][["original_id"]] |>
strsplit(split = "\\|") |>
unlist()

# there could be additional values such as the type?
nodes_idx <- which(node_ids %in% annotation_id)
entity_nodes <- nodes[nodes_idx]

for (node_idx in 1:length(entity_nodes)) {
if (node_idx == 1) {
xml2::xml_add_parent(.x = entity_nodes[[node_idx]], .value = entity_name)
} else {
xml2::xml_add_sibling(.x = entity_nodes[[1]], .value = entity_nodes[[node_idx]], .copy = FALSE)
}
}

# and set attributes to this new parent node
xml_set_attrs(xml2::xml_parent(entity_nodes), attributes_to_add)

} else if (entity_nodes == entity_name) {

# this assumes that the new nodes should be the same name as the old
# nodes. Are there scenarios in which this does not hold true?

# if it is a named entity already, set attribute here
# Does this work if there are multiple nodes here?

xml_set_attrs(entity_nodes, attributes_to_add)


} else {

# else there is a pre-annotated feature such as named entities

# the ID in the annotation data then points to the first word of the
# entity, with a trailing "_FEATURE-TAG" behind the ID itself.

annotation_id <- annotation_dt[i, ][["original_id"]]

feature_tag_end <- paste0("_", feature_tag, "$")
feature_word_id <- gsub(feature_tag_end, "", annotation_id)

# get first word node
nodes_idx <- which(node_ids %in% feature_word_id)
feature_node <- xml2::xml_parent(nodes[nodes_idx])

xml2::xml_set_attrs(x = feature_node, value = attributes_to_add)

}
}
}
Expand All @@ -81,7 +94,7 @@ xml_enrich <- function(xml,
#' @param xml xml the namespace is derived from
#' @param tags the tags which should be queried with the XPATH#
#' @export
namespaced_xpath = function(xml = xml, tags) {
namespaced_xpath <- function(xml = xml, tags) {

xml_namespace <- xml2::xml_ns(xml)

Expand Down
1 change: 1 addition & 0 deletions man/get_dbpedia_uris.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 16 additions & 5 deletions man/map_types_to_class.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 16 additions & 9 deletions man/xml_enrich.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 128cc90

Please sign in to comment.