From e3f3c41006a7d928b6eb44d4bf25b523e3f8f98e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 31 Mar 2024 20:58:20 +0200 Subject: [PATCH] continue after http error abort #47 --- DESCRIPTION | 4 +- NEWS.md | 11 + R/dbpedia.R | 437 ++++++++++++++++++++++++++-------------- man/get_dbpedia_uris.Rd | 16 ++ 4 files changed, 318 insertions(+), 150 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3e32990..7924ee9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: dbpedia Type: Package Title: R Wrapper for DBpedia Spotlight -Version: 0.1.2.9003 -Date: 2024-03-27 +Version: 0.1.2.9005 +Date: 2024-03-31 Authors@R: c( person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")), person("Christoph", "Leonhardt", role = "aut") diff --git a/NEWS.md b/NEWS.md index 5fe269a..f445f8f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,21 +1,32 @@ +## dbpedia v0.1.2.9004 + +* Method `get_dbpedia_uris()` has new argument `retry` to retry if API is stalled #45 and new argument `logfile` for tracking and debugging longrunning annotation tasks. If the annotation failes, `NULL` is returned (no abort). + + ## dbpedia v0.1.2.9003 + * started introducing functionality to detect and resolve overlaps (see issue #42) with `detect_overlap()` and `categorize_overlap()` * added `resolve_overlap()` as an (experimental) function to resolve overlaps identified and categorized in `detect_overlap()` and `categorize_overlap()` * introduced tests for `detect_overlap()` and `categorize_overlap()` * modified test suite to follow recommendations of "R Packages" (2nd edition) by Wickham and Bryan, in particular by using `withr` for self-contained tests + ## dbpedia v0.1.2.9002 + * `expand_to_token` of `get_dbpedia_uris()` also expands spans to the left now (#44) * added `end` to data.table grouping in `get_dbpedia_uris()` for subcorpora to address issue #43. This avoids processing multiple entities at the same time. * reorganized tests via `testthat`, i.e. removed `context` and renamed files to start with "test-" * added test for `expand_to_token` argument + ## dbpedia v0.1.2.9001 + * `entity_types_map()` now creates assignments again (#40) and returns them as character vectors * `entity_types_map()` also passes all arguments when used with data.table objects * `types_src` works in `get_dbpedia_uris()` for documents with a single type (#41) * messages for `types_src` follow verbosity set by the argument `verbose` + ## dbpedia v0.1.2 * `get_dbpedia_uris()` has new argument `types` to filter results. * `dbpedia_spotlight_status()` without warnings if docker not available / not running #32. diff --git a/R/dbpedia.R b/R/dbpedia.R index 7dc9bef..e5a6b6a 100644 --- a/R/dbpedia.R +++ b/R/dbpedia.R @@ -318,7 +318,10 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) { #' @rdname get_dbpedia_uris -setGeneric("get_dbpedia_uris", function(x, ...) standardGeneric("get_dbpedia_uris")) +setGeneric( + "get_dbpedia_uris", + function(x, ...) standardGeneric("get_dbpedia_uris") +) #' @exportMethod get_dbpedia_uris #' @rdname get_dbpedia_uris @@ -359,141 +362,209 @@ setMethod( max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, types_src = c("DBpedia", "Wikidata"), verbose = TRUE ) { - - if (nchar(x) > max_len) { - if (verbose) cli_alert_warning( - "input text has length {nchar(x)}, truncate to max_len ({.val {max_len}})" - ) - x <- substr(x, 1L, max_len) - } - - if (!is.numeric(support) | !(length(support) == 1)) { - cli_alert_warning("argument `support` required to be a numeric value") - } - - if (verbose) cli_progress_step("send request to DBpedia Spotlight") - request <- httr::GET( - url = api, - query = c( - list( - text = x, - support = as.character(support), - confidence = confidence - ), - if (length(types) == 0L) - list() - else - list(types = paste(types, collapse = ",")) - ), - httr::add_headers('Accept' = 'application/json') - ) - - if (httr::http_error(request)) { - cli_alert_danger("http error response") - stop("abort") - } - - if (verbose) cli_progress_step("parse result") - txt <- httr::content(request, as = "text", encoding = "UTF-8") - json <- jsonlite::fromJSON(txt) - resources <- as.data.table(json[["Resources"]]) - - if (nrow(resources) == 0L) { - return( - data.table( - start = integer(), - text = character(), - dbpedia_uri = character(), - types = character() + + if (nchar(x) > max_len) { + if (verbose) cli_alert_warning( + "input text has length {nchar(x)}, truncate to max_len ({.val {max_len}})" ) + x <- substr(x, 1L, max_len) + } + + if (!is.numeric(support) | !(length(support) == 1)) { + cli_alert_warning("argument `support` required to be a numeric value") + } + + dt_empty <- data.table( + start = integer(), + text = character(), + dbpedia_uri = character(), + types = character() ) - } - - resources_min <- resources[, c("@URI", "@surfaceForm", "@offset", "@types")] - setnames( - resources_min, - old = c("@URI", "@surfaceForm", "@offset", "@types"), - new = c("dbpedia_uri", "text", "start", "types") - ) - setcolorder(resources_min, c("start", "text", "dbpedia_uri", "types")) - - resources_min[, "start" := as.integer(resources_min[["start"]]) + 1L] - - # See issue 41. - types_list <- strsplit(x = resources_min[["types"]], split = ",") - - resources_min[, "types" := lapply( - types_list, - function(x) { - if (length(x) == 0L) return(list()) - spl <- strsplit(x, split = ":") - types <- split( - x = unlist(lapply(spl, `[`, 2L)), - f = unlist(lapply(spl, `[`, 1L)) + + if (verbose) cli_progress_step("send request to DBpedia Spotlight") + request_max <- if (is.logical(retry)) as.integer(request) else retry + request_number <- 1L + proceed <- TRUE + + while (proceed) { + if (!is.null(logfile)){ + cat( + sprintf("[%s] %s\n", Sys.time(), substr(x, 1, 50)), + file = logfile, + append = TRUE + ) + } + + request <- httr::GET( + url = api, + query = c( + list( + text = x, + support = as.character(support), + confidence = confidence + ), + if (length(types) == 0L) + list() + else + list(types = paste(types, collapse = ",")) + ), + httr::add_headers('Accept' = 'application/json') ) - if (length(types) == 1L & length(types_list) == 1L) { - list(types) + + if (httr::http_error(request)) { + cli_alert_danger("http error response") + if (request_number <= request_max){ + if (!is.null(logfile)){ + cat(x, file = logfile, append = TRUE) + cat("\n", file = logfile, append = TRUE) + cat( + sprintf("request %d failed, waiting for retry\n", request_number), + file = logfile, + append = TRUE + ) + docker_stats <- system2( + command = "docker", + args = c("stats", "--no-stream"), stdout = TRUE + ) + cat( + paste(docker_stats, collapse = "\n"), + file = logfile, append = TRUE + ) + cat("\n", file = logfile, append = TRUE) + + gc_output <- capture.output(gc()) + cat( + paste(gc_output, collapse = "\n"), + file = logfile, append = TRUE + ) + cat("\n", file = logfile, append = TRUE) + } + cli_alert_info("http error response - waiting and trying again") + Sys.sleep(1) + request_number <- request_number + 1L + next + } else { + if (!is.null(logfile)){ + cat( + "Unable to process chunk, moving on with next chunk \n", + file = logfile, append = TRUE + ) + } + warning( + sprintf("retried request %d times - returning NULL", request_number) + ) + # old school warning, so that it is available calling `warnings()` + return(NULL) + } } else { - types + # request successful, no need to continue + proceed <- FALSE } } - )] - - if (length(types_src) > 0L) { - src_all <- unique(unlist(lapply(resources_min[["types"]], names))) - src_unused <- setdiff(src_all, types_src) - if (length(src_unused) > 0L & isTRUE(verbose)) - cli_alert_info( - "dropping available types from: {paste(src_unused, collapse = ' / ')}" - ) - for (src in types_src){ - types_vec <- unlist(lapply( - lapply(resources_min[["types"]], `[[`, src), - function(x) { - if (is.null(x)) - NA_character_ - else - sprintf("|%s|", paste(x, collapse = "|")) + + if (verbose) cli_progress_step("parse result") + txt <- httr::content(request, as = "text", encoding = "UTF-8") + json <- jsonlite::fromJSON(txt) + resources <- as.data.table(json[["Resources"]]) + + if (nrow(resources) == 0L) return(dt_empty) + + resources_min <- resources[, c("@URI", "@surfaceForm", "@offset", "@types")] + setnames( + resources_min, + old = c("@URI", "@surfaceForm", "@offset", "@types"), + new = c("dbpedia_uri", "text", "start", "types") + ) + setcolorder(resources_min, c("start", "text", "dbpedia_uri", "types")) + + resources_min[, "start" := as.integer(resources_min[["start"]]) + 1L] + + # See issue 41. + types_list <- strsplit(x = resources_min[["types"]], split = ",") + + resources_min[, "types" := lapply( + types_list, + function(x) { + if (length(x) == 0L) return(list()) + spl <- strsplit(x, split = ":") + types <- split( + x = unlist(lapply(spl, `[`, 2L)), + f = unlist(lapply(spl, `[`, 1L)) + ) + if (length(types) == 1L & length(types_list) == 1L) { + list(types) + } else { + types } - )) - - resources_min[, (paste(src, "type", sep = "_")) := types_vec] + } + )] + + if (length(types_src) > 0L) { + src_all <- unique(unlist(lapply(resources_min[["types"]], names))) + src_unused <- setdiff(src_all, types_src) + if (length(src_unused) > 0L & isTRUE(verbose)) + cli_alert_info( + "dropping available types from: {paste(src_unused, collapse = ' / ')}" + ) + for (src in types_src){ + types_vec <- unlist(lapply( + lapply(resources_min[["types"]], `[[`, src), + function(x) { + if (is.null(x)) + NA_character_ + else + sprintf("|%s|", paste(x, collapse = "|")) + } + )) + + resources_min[, (paste(src, "type", sep = "_")) := types_vec] + } } + + resources_min } - - resources_min -}) +) #' @exportMethod get_dbpedia_uris #' @rdname get_dbpedia_uris -setMethod("get_dbpedia_uris", - "AnnotatedPlainTextDocument", - function(x, - language = getOption("dbpedia.lang"), - max_len = 5600L, - confidence = 0.35, - api = getOption("dbpedia.endpoint"), - types = character(), - support = 20, - verbose = TRUE) { - - get_dbpedia_uris( - x = as.character(x[["content"]]), - language = language, - max_len = max_len, - confidence = confidence, - api = api, - types = types, - support = support, - verbose = verbose - ) - }) +setMethod( + "get_dbpedia_uris", + "AnnotatedPlainTextDocument", + function( + x, + language = getOption("dbpedia.lang"), + max_len = 5600L, + confidence = 0.35, + api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, + types = character(), + support = 20, + verbose = TRUE + ) { + + get_dbpedia_uris( + x = as.character(x[["content"]]), + language = language, + max_len = max_len, + confidence = confidence, + api = api, + retry = retry, + logfile = logfile, + types = types, + support = support, + verbose = verbose + ) + } +) #' Get DBpedia links. #' @@ -519,6 +590,8 @@ setMethod("get_dbpedia_uris", #' @param confidence A `numeric` value, the minimum similarity score that serves #' as threshold before DBpedia Spotlight includes a link into the report. #' @param api An URL of the DBpedia Spotlight API. +#' @param retry A `logical` value, whether to retry in case of a http error. +#' @param logfile Filename for writing logs (e.g. for debugging purposes). #' @param types A `character` vector to restrict result returned to certain #' entity types, such as 'Company' or 'Organization'. If the `character` #' vector is empty (default), no restrictions are applied. @@ -548,6 +621,7 @@ setMethod("get_dbpedia_uris", #' - *types*: Recognized entity types, for each row a named list, if available #' entries such as 'DBpedia', 'Schema', 'Wikidata', 'DUL'. #' Depending on the input object, further columns may be available. +#' If the request to the endpoint failes, `NULL` is returned. #' @exportMethod get_dbpedia_uris #' @importFrom cli cli_alert_warning cli_progress_step cli_alert_danger #' cli_progress_done cli_alert_info @@ -582,9 +656,36 @@ setMethod("get_dbpedia_uris", #' subset(p_type == "speech") %>% #' get_dbpedia_uris(language = "de", s_attribute = "ne", max_len = 5067) #' -setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), types = character(), support = 20, expand_to_token = FALSE, drop_inexact_annotations = TRUE, verbose = TRUE) { +setMethod( + "get_dbpedia_uris", + "subcorpus", + function( + x, + language = getOption("dbpedia.lang"), + p_attribute = "word", + s_attribute = NULL, + max_len = 5600L, + confidence = 0.35, + api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, + types = character(), + support = 20, + expand_to_token = FALSE, + drop_inexact_annotations = TRUE, + verbose = TRUE + ) { - if (verbose) cli_progress_step("convert input to `AnnotatedPlainTextDocument`") + # empty data.table/result if nothing to be mapped or nothing mapped + dt_empty <- data.table( + start = integer(), + text = character(), + dbpedia_uri = character(), + types = character() + ) + + if (verbose) + cli_progress_step("convert input to `AnnotatedPlainTextDocument`") doc <- decode( x, to = "AnnotatedPlainTextDocument", @@ -604,10 +705,14 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp max_len = max_len, confidence = confidence, api = api, + retry = retry, + logfile = logfile, types = types, support = support, verbose = verbose ) + + if (is.null(links)) return(dt_empty) # prepare function to assign cpos_right depending on value and arguments expand_fun = function(.SD, direction) { @@ -649,16 +754,8 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp } else { dt <- as.data.table(doc, what = s_attribute) - if (nrow(dt) == 0){ # if there are no elements of s_attribute #23 - return( - data.table( - start = integer(), - text = character(), - dbpedia_uri = character(), - types = character() - ) - ) - } + # if there are no elements of s_attribute #23 + if (nrow(dt) == 0) return(dt_empty) tab <- links[dt, on = c("start", "text")] @@ -687,7 +784,10 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp tab[["end"]] <- NULL tab[["id"]] <- NULL - setcolorder(x = tab, neworder = c("cpos_left", "cpos_right", "dbpedia_uri", "text", "types")) + setcolorder( + x = tab, + neworder = c("cpos_left", "cpos_right", "dbpedia_uri", "text", "types") + ) if (verbose) { lapply( @@ -710,8 +810,12 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp # drop entities which cannot be mapped exactly to the tokenstream from the # output (see issues #26, #44). if (isTRUE(drop_inexact_annotations) & (any(is.na(tab[["cpos_right"]])) | any(is.na(tab[["cpos_left"]])))) { - missing_cpos_idx <- unique(c(which(is.na(tab[["cpos_right"]])), which(is.na(tab[["cpos_left"]])))) - cli_alert_warning("Cannot map {length(missing_cpos_idx)} entit{?y/ies} exactly to tokenstream. Dropping {?it/them} from the annotation.") + missing_cpos_idx <- unique( + c(which(is.na(tab[["cpos_right"]])), which(is.na(tab[["cpos_left"]]))) + ) + cli_alert_warning( + "Cannot map {length(missing_cpos_idx)} entit{?y/ies} exactly to tokenstream. Dropping {?it/them} from the annotation." + ) tab <- tab[-missing_cpos_idx, ] } @@ -730,7 +834,25 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp #' p_attribute = "word", #' verbose = TRUE #' ) -setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, confidence = 0.35, api = getOption("dbpedia.endpoint"), types = character(), support = 20, max_len = 5600L, expand_to_token = FALSE, verbose = TRUE, progress = FALSE) { +setMethod( + "get_dbpedia_uris", + "subcorpus_bundle", + function( + x, + language = getOption("dbpedia.lang"), + p_attribute = "word", + s_attribute = NULL, + confidence = 0.35, + api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, + types = character(), + support = 20, + max_len = 5600L, + expand_to_token = FALSE, + verbose = TRUE, + progress = FALSE + ) { if (progress) { env <- parent.frame() @@ -748,6 +870,8 @@ setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOpti max_len = max_len, confidence = confidence, api = api, + retry = retry, + logfile = logfile, types = types, support = support, expand_to_token = expand_to_token, @@ -757,10 +881,10 @@ setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOpti ) if (progress) cli_progress_done(.envir = env) - y <- rbindlist(li) + # we use fill = TRUE, because columns for types may be missing in empty lists + y <- rbindlist(li, fill = TRUE) setorderv(y, cols = "cpos_left", order = 1L) - - + if (verbose) { if (!is.null(s_attribute)) { cli_alert_info( @@ -793,6 +917,8 @@ setMethod( max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, verbose = TRUE, @@ -827,6 +953,8 @@ setMethod( max_len = max_len, confidence = confidence, api = api, + retry = retry, + logfile = logfile, types = types, support = support, verbose = if (progress) FALSE else verbose @@ -865,6 +993,8 @@ setMethod( max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, expand_to_token = FALSE, @@ -878,7 +1008,10 @@ setMethod( # text part. if (!is.null(text_tag)) { - nodes <- xml2::xml_find_all(x, xpath = namespaced_xpath(xml = x, tags = text_tag)) + nodes <- xml2::xml_find_all( + x, + xpath = namespaced_xpath(xml = x, tags = text_tag) + ) } else { nodes <- x @@ -895,18 +1028,22 @@ setMethod( if (is.null(segment)) { nodes_to_process <- nodes } else { - nodes_to_process <- xml2::xml_find_all(nodes, - xpath = namespaced_xpath(xml = x, - tags = segment)) + nodes_to_process <- xml2::xml_find_all( + nodes, + xpath = namespaced_xpath(xml = x, tags = segment) + ) } - if (verbose) cli_progress_step("preparing {.val {length(nodes_to_process)}} annotation tables.") - - docs <- to_annotation(nodes = nodes_to_process, - xml = x, - token_tags = token_tags, - feature_tag = feature_tag) + if (verbose) + cli_progress_step("preparing {.val {length(nodes_to_process)}} annotation tables.") + docs <- to_annotation( + nodes = nodes_to_process, + xml = x, + token_tags = token_tags, + feature_tag = feature_tag + ) + if (verbose) cli_progress_done() # prepare function to assign ID depending on value and arguments @@ -951,6 +1088,8 @@ setMethod( max_len = max_len, confidence = confidence, api = api, + retry = retry, + logfile = logfile, types = types, support = support, verbose = verbose @@ -1005,7 +1144,9 @@ setMethod( if (isTRUE(drop_inexact_annotations) & any(is.na(tab[["original_id"]]))) { missing_id_idx <- which(is.na(tab[["original_id"]])) - cli_alert_warning("Cannot map {length(missing_id_idx)} entit{?y/ies} exactly to tokenstream. Dropping {?it/them} from the annotation.") + cli_alert_warning( + "Cannot map {length(missing_id_idx)} entit{?y/ies} exactly to tokenstream. Dropping {?it/them} from the annotation." + ) tab <- tab[-missing_id_idx, ] } diff --git a/man/get_dbpedia_uris.Rd b/man/get_dbpedia_uris.Rd index 9b0d223..992b44d 100644 --- a/man/get_dbpedia_uris.Rd +++ b/man/get_dbpedia_uris.Rd @@ -19,6 +19,8 @@ get_dbpedia_uris(x, ...) max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, types_src = c("DBpedia", "Wikidata"), @@ -31,6 +33,8 @@ get_dbpedia_uris(x, ...) max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, verbose = TRUE @@ -44,6 +48,8 @@ get_dbpedia_uris(x, ...) max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, expand_to_token = FALSE, @@ -58,6 +64,8 @@ get_dbpedia_uris(x, ...) s_attribute = NULL, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, max_len = 5600L, @@ -72,6 +80,8 @@ get_dbpedia_uris(x, ...) max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, verbose = TRUE, @@ -88,6 +98,8 @@ get_dbpedia_uris(x, ...) max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), + retry = TRUE, + logfile = NULL, types = character(), support = 20, expand_to_token = FALSE, @@ -113,6 +125,10 @@ as threshold before DBpedia Spotlight includes a link into the report.} \item{api}{An URL of the DBpedia Spotlight API.} +\item{retry}{A \code{logical} value, whether to retry in case of a http error.} + +\item{logfile}{Filename for writing logs (e.g. for debugging purposes).} + \item{types}{A \code{character} vector to restrict result returned to certain entity types, such as 'Company' or 'Organization'. If the \code{character} vector is empty (default), no restrictions are applied.}