From a52d1fbd66e67b19a42ce6c56e2b03afcbc4ada3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Mon, 26 Feb 2024 16:12:53 +0100 Subject: [PATCH] pipes dropped for oldrel compatibility #38 --- NAMESPACE | 1 + R/dbpedia.R | 65 ++++++++++++++++++++--------------------- R/utils.R | 19 +++++------- R/wikidata.R | 15 +++++++--- R/xml.R | 17 ++++++----- man/get_dbpedia_uris.Rd | 5 ++-- man/wikidata_uris.Rd | 15 +++++++--- 7 files changed, 75 insertions(+), 62 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 619ae60..7719f46 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,6 +57,7 @@ importFrom(tibble,as_tibble) importFrom(utils,URLencode) importFrom(xml2,read_xml) importFrom(xml2,xml_attr) +importFrom(xml2,xml_children) importFrom(xml2,xml_find_all) importFrom(xml2,xml_set_attrs) importFrom(xml2,xml_text) diff --git a/R/dbpedia.R b/R/dbpedia.R index 55116e9..8f7d89c 100644 --- a/R/dbpedia.R +++ b/R/dbpedia.R @@ -155,6 +155,7 @@ as_annotation <- function(x){ #' @param feature_tag ... #' @importFrom stringi stri_c #' @importFrom NLP Annotation +#' @importFrom xml2 xml_children to_annotation = function(nodes, xml, token_tags, feature_tag) { if (inherits(nodes, "xml_nodeset")) { @@ -169,8 +170,10 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) { } else { - token_elements <- nodes |> - xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = token_tags)) + token_elements <- xml2::xml_find_all( + nodes, + xpath = namespaced_xpath(xml = xml, tags = token_tags) + ) # make token annotation data annotation @@ -210,9 +213,10 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) { # data.frame split to rwos token_feat_dataframe <- data.frame(word = toks, id = tok_ids) - token_feat_list <- split(token_feat_dataframe, seq(nrow(token_feat_dataframe))) |> - unname() - + token_feat_list <- unname( + split(token_feat_dataframe, seq(nrow(token_feat_dataframe))) + ) + token_annotation <- NLP::Annotation( seq_along(tok_ids), # IDs must be integer, which is a bit unfortunate rep("word", length(tok_ids)), @@ -224,49 +228,45 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) { # and add feature elements if chosen if (!is.null(feature_tag)) { - feature_elements <- nodes |> - xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = feature_tag)) + feature_elements <- xml2::xml_find_all( + nodes, + xpath = namespaced_xpath(xml = xml, tags = feature_tag) + ) } else { feature_elements <- NULL } if (length(feature_elements) > 0) { - - feature_ids <- sapply(feature_elements, function(element) { - xml2::xml_find_first(element, - xpath = namespaced_xpath(xml = xml, tags = token_tags)) |> - xml2::xml_attr("id") - } - ) + feature_ids <- sapply( + feature_elements, + function(element) { + el <- xml2::xml_find_first( + element, + xpath = namespaced_xpath(xml = xml, tags = token_tags) + ) + xml2::xml_attr(el, "id") + }) feature_ids <- sprintf("%s_%s", feature_ids, feature_tag) # get attributes of features feature_ids <- feature_ids # name has no ID. We use the first word ID (assuming that there are no overlaps?) feature_kinds <- xml2::xml_attr(feature_elements, "type") - feature_texts <- sapply(feature_elements, function(feat) { - xml2::xml_children(feat) |> - xml2::xml_text() |> - paste(collapse = " ") - } + feature_texts <- sapply( + feature_elements, + function(feat) paste(xml_text(xml_children(feat)), collapse = " ") ) # get spans for features - entity_spans <- sapply(feature_elements, function(element) { - child_id <- element |> - xml2::xml_children() |> - xml2::xml_attr("id") - + entity_spans <- t(sapply(feature_elements, function(element) { + child_id <- xml_attr(xml_children(element), "id") child_idx <- which(tok_ids %in% child_id) child_start <- min(start_positions[child_idx]) child_end <- max(end_positions[child_idx]) - - matrix(c(child_start, child_end), nrow = 1, ncol = 2) - - } - ) |> t() + matrix(c(child_start, child_end), nrow = 1L, ncol = 2L) + })) feature_annotation <- NLP::Annotation( @@ -300,7 +300,7 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) { # make string word_with_ws <- paste(toks, ifelse(is.na(tok_joins), " ", ""), sep = "") - s <- stringi::stri_c(word_with_ws, collapse = "") |> trimws() + s <- trimws(stringi::stri_c(word_with_ws, collapse = "")) # add segment id as metadata (should work if segment is NULL as the TEI has # an ID as well). @@ -705,13 +705,12 @@ setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOpti #' #' # Process quanteda corpus #' library(quanteda) -#' uritab <- data_char_ukimmig2010 |> -#' corpus() |> +#' uritab <- data_char_ukimmig2010 %>% +#' corpus() %>% #' get_dbpedia_uris( #' verbose = FALSE, #' config = httr::config(http_version = 1.1) #' ) -#' #' @rdname get_dbpedia_uris setMethod( "get_dbpedia_uris", diff --git a/R/utils.R b/R/utils.R index 8e7a5cd..63513d8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -222,23 +222,20 @@ map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE # types is a list of lists. Transform to single character vector. type_list <- unlist(types, recursive = FALSE) - types_with_class <- lapply(seq_along(type_list), function(i) { + types_with_class_raw <- lapply(seq_along(type_list), function(i) { list_name <- names(type_list)[[i]] list_elements <- type_list[[i]] paste0(list_name, ":", list_elements) - }) |> - unlist() |> - intersect(mapping_vector) + }) + types_with_class <- intersect(unlist(types_with_class_raw), mapping_vector) - if (length(types_with_class) > 0) { + if (length(types_with_class) > 0L) { match_idx <- which(mapping_vector %in% types_with_class) - class_name <- mapping_vector |> - names() |> - _[match_idx] |> - unique() |> - sort() |> - paste(collapse = "|") + class_name <- paste( + sort(unique(names(mapping_vector)[match_idx])), + collapse = "|" + ) } else { class_name <- other diff --git a/R/wikidata.R b/R/wikidata.R index 5bc9d09..5ca79c7 100644 --- a/R/wikidata.R +++ b/R/wikidata.R @@ -263,10 +263,17 @@ setGeneric( #' #' httr::set_config(httr::config(ssl_verifypeer = 0L)) #' -#' uritab <- data_char_ukimmig2010 |> -#' corpus() |> -#' get_dbpedia_uris(progress = TRUE) %>% -#' add_wikidata_uris(endpoint = "https://dbpedia.org/sparql/", progress = TRUE, chunksize = 100) %>% +#' uritab <- data_char_ukimmig2010 %>% +#' corpus() %>% +#' get_dbpedia_uris( +#' progress = TRUE, +#' config = httr::config(http_version = 1.1) +#' ) %>% +#' add_wikidata_uris( +#' endpoint = "https://dbpedia.org/sparql/", +#' progress = TRUE, +#' chunksize = 100 +#' ) %>% #' wikidata_query(id = "P31") #' } #' diff --git a/R/xml.R b/R/xml.R index c53ea4b..0e423c3 100644 --- a/R/xml.R +++ b/R/xml.R @@ -26,11 +26,12 @@ xml_enrich <- function(xml, ) { # get all nodes which might contain entities - nodes <- xml |> - xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = token_tags)) + nodes <- xml2::xml_find_all( + xml, + xpath = namespaced_xpath(xml = xml, tags = token_tags) + ) - node_ids <- nodes |> - xml2::xml_attr("id") + node_ids <- xml2::xml_attr(nodes, "id") # for each annotation, extract identified words @@ -46,9 +47,11 @@ xml_enrich <- function(xml, # if there is no feature tag, pre-annotated named entities weren't # provided. Add identified named entities to tokens. - annotation_id <- annotation_dt[i, ][["original_id"]] |> - strsplit(split = "\\|") |> - unlist() + annotation_id <- unlist(strsplit( + annotation_dt[i, ][["original_id"]], + split = "\\|" + ) + ) # there could be additional values such as the type? nodes_idx <- which(node_ids %in% annotation_id) diff --git a/man/get_dbpedia_uris.Rd b/man/get_dbpedia_uris.Rd index 573f6ca..e4192ac 100644 --- a/man/get_dbpedia_uris.Rd +++ b/man/get_dbpedia_uris.Rd @@ -227,11 +227,10 @@ uritab <- corpus("REUTERS") \%>\% # Process quanteda corpus library(quanteda) -uritab <- data_char_ukimmig2010 |> - corpus() |> +uritab <- data_char_ukimmig2010 \%>\% + corpus() \%>\% get_dbpedia_uris( verbose = FALSE, config = httr::config(http_version = 1.1) ) - } diff --git a/man/wikidata_uris.Rd b/man/wikidata_uris.Rd index 991d1c5..1dfad0c 100644 --- a/man/wikidata_uris.Rd +++ b/man/wikidata_uris.Rd @@ -80,10 +80,17 @@ options(dbpedia.endpoint = "http://api.dbpedia-spotlight.org/en/annotate") httr::set_config(httr::config(ssl_verifypeer = 0L)) -uritab <- data_char_ukimmig2010 |> - corpus() |> - get_dbpedia_uris(progress = TRUE) \%>\% - add_wikidata_uris(endpoint = "https://dbpedia.org/sparql/", progress = TRUE, chunksize = 100) \%>\% +uritab <- data_char_ukimmig2010 \%>\% + corpus() \%>\% + get_dbpedia_uris( + progress = TRUE, + config = httr::config(http_version = 1.1) + ) \%>\% + add_wikidata_uris( + endpoint = "https://dbpedia.org/sparql/", + progress = TRUE, + chunksize = 100 + ) \%>\% wikidata_query(id = "P31") }