From 7371b00502fa64dd4d6fcab9869b087093432c4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 10 Apr 2024 17:31:51 +0200 Subject: [PATCH] address data.table R CMD check warnings --- NEWS.md | 1 + R/dbpedia.R | 50 ++++++++++++++++++++++++----------- R/overlaps.R | 34 ++++++++++++------------ R/segment.R | 17 +++++++++--- man/get_dbpedia_uris.Rd | 13 +++++++++ man/segment.Rd | 4 +-- tests/testthat/test-segment.R | 31 ++++++++++++++++++++++ 7 files changed, 111 insertions(+), 39 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5308978..03fd663 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ * A new auxiliary function `segment()` generates overlapping segments of text for strings longer than the maximum nchar that can be processed by DBpedia Spotlight. * Method `get_dbpedia_uris()` has new argument `overlap` passed into `segment()`. +* Method `get_dbpedia_uris()` has new argument `offset` to indicate base offset number other than 1. ## dbpedia v0.1.2.9004 diff --git a/R/dbpedia.R b/R/dbpedia.R index 2a3225b..a17dcbc 100644 --- a/R/dbpedia.R +++ b/R/dbpedia.R @@ -1,5 +1,5 @@ `:=` <- function(...) NULL -.SD <- NULL +.SD <- .GRP <- .I <- .N <- NULL #' Set and report status of DBpedia Spotlight #' @@ -326,6 +326,7 @@ setGeneric( #' @exportMethod get_dbpedia_uris #' @rdname get_dbpedia_uris #' @importFrom data.table data.table +#' @importFrom utils capture.output #' @examples #' \dontrun{ #' # Process AnnotatedPlainTextDocument (example available in NLP package) @@ -361,6 +362,7 @@ setMethod( language = getOption("dbpedia.lang"), max_len = 5600L, overlap = 500L, + offset = 1L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -378,13 +380,14 @@ setMethod( ) segs <- segment(x = x, max_len = max_len, overlap = overlap) dts <- lapply( - segs, - function(seg){ + seq_along(segs), + function(i){ get_dbpedia_uris( - x = seg, + x = segs[[i]], language = language, max_len = max_len, # input 'seg' must be below this threshold overlap = overlap, # may not be needed + offset = as.integer(names(segs)[i]), confidence = confidence, api = api, retry = retry, @@ -398,18 +401,21 @@ setMethod( } ) - pos <- as.integer(names(segs)) + offset <- as.integer(names(segs)) for (i in seq_along(dts)){ - if (i == 1){ - breakpoint <- (nchar(dts[[1L]]) - pos[2L]) / 2 - dts[[1L]] <- dts[[1L]][dts[[1L]][["start"]] < breakpoint] + if (i == 1L){ + breakpoint_r <- offset[2L] + (nchar(segs[1L]) - offset[2L]) / 2 + dts[[1L]] <- dts[[1L]][dts[[1L]][["start"]] < breakpoint_r] } else if (i == length(dts)){ - breakpoint <- ((pos[i - 1L] + nchar(segs[i - 1L]) - 1L) - pos[i]) / 2 - dts[[i]] <- dts[[i]][dts[[i]][["start"]] > breakpoint] + offset_prev <- offset[i - 1L] + nchar(segs[i - 1L]) - 1L + breakpoint_l <- offset[i] + ((offset_prev - offset[i]) / 2) + dts[[i]] <- dts[[i]][dts[[i]][["start"]] >= breakpoint_l] } else { - breakpoint_l <- ((pos[i - 1L] + nchar(segs[i - 1L] - 1L)) - pos[i]) / 2 - breakpoint_r <- ((pos[i] + nchar(segs[i] - 1L)) - pos[i + 1]) / 2 - dts[[i]] <- dts[[i]][dts[[i]][["start"]] > breakpoint_l] + offset_prev <- offset[i - 1L] + nchar(segs[i - 1L]) - 1L + breakpoint_l <- offset[i] + ((offset_prev - offset[i]) / 2) + rbound_current <- offset[i] + nchar(segs[i]) - 1L + breakpoint_r <- offset[i + 1] + ((rbound_current - offset[i + 1]) / 2) + dts[[i]] <- dts[[i]][dts[[i]][["start"]] >= breakpoint_l] dts[[i]] <- dts[[i]][dts[[i]][["start"]] < breakpoint_r] } } @@ -428,7 +434,7 @@ setMethod( ) if (verbose) cli_progress_step("send request to DBpedia Spotlight") - request_max <- if (is.logical(retry)) as.integer(request) else retry + request_max <- if (is.logical(retry)) as.integer(retry) else retry request_number <- 1L proceed <- TRUE @@ -523,8 +529,8 @@ setMethod( ) setcolorder(resources_min, c("start", "text", "dbpedia_uri", "types")) - resources_min[, "start" := as.integer(resources_min[["start"]]) + 1L] - + resources_min[, "start" := as.integer(resources_min[["start"]]) + offset] + # See issue 41. types_list <- strsplit(x = resources_min[["types"]], split = ",") @@ -581,6 +587,7 @@ setMethod( x, language = getOption("dbpedia.lang"), max_len = 5600L, + overlap = 1000L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -594,6 +601,7 @@ setMethod( x = as.character(x[["content"]]), language = language, max_len = max_len, + overlap = overlap, confidence = confidence, api = api, retry = retry, @@ -626,6 +634,8 @@ setMethod( #' threshold of 5600 characters is the default value. #' @param overlap If the input string `x` is longer than `max_len`, the numnber #' of overlapping characters (passed into `segment()`). +#' @param offset An integer value with the base offset position of the text to +#' be annotated. #' @param language The language of the input text ("en", "fr", "de", ...) to #' determine the stopwords used. #' @param confidence A `numeric` value, the minimum similarity score that serves @@ -706,6 +716,7 @@ setMethod( p_attribute = "word", s_attribute = NULL, max_len = 5600L, + overlap = 1000L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -744,6 +755,7 @@ setMethod( x = doc, language = language, max_len = max_len, + overlap = overlap, confidence = confidence, api = api, retry = retry, @@ -890,6 +902,7 @@ setMethod( types = character(), support = 20, max_len = 5600L, + overlap = 1000L, expand_to_token = FALSE, verbose = TRUE, progress = FALSE @@ -909,6 +922,7 @@ setMethod( language = language, s_attribute = s_attribute, max_len = max_len, + overlap = overlap, confidence = confidence, api = api, retry = retry, @@ -956,6 +970,7 @@ setMethod( x, language = getOption("dbpedia.lang"), max_len = 5600L, + overlap = 1000L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -992,6 +1007,7 @@ setMethod( x = docs[[docname]], language = language, max_len = max_len, + overlap = overlap, confidence = confidence, api = api, retry = retry, @@ -1032,6 +1048,7 @@ setMethod( token_tags = c("w", "pc"), text_tag = NULL, max_len = 5600L, + overlap = 1000L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -1127,6 +1144,7 @@ setMethod( x = doc, language = language, max_len = max_len, + overlap = overlap, confidence = confidence, api = api, retry = retry, diff --git a/R/overlaps.R b/R/overlaps.R index de2d7da..71930e5 100644 --- a/R/overlaps.R +++ b/R/overlaps.R @@ -64,7 +64,7 @@ detect_overlap <- function(x, } if ("doc" %in% colnames(x)) { - x[, ovl_id := detect_overlap_aux(.SD, + x[, "ovl_id" := detect_overlap_aux(.SD, group_id = .GRP, start_col = start_col, end_col = end_col, @@ -73,7 +73,7 @@ detect_overlap <- function(x, } else { - x[, ovl_id := detect_overlap_aux(input_dt = x, + x[, "ovl_id" := detect_overlap_aux(input_dt = x, group_id = NULL, start_col = start_col, end_col = end_col, @@ -192,7 +192,7 @@ detect_overlap_aux <- function(input_dt, } # merge to input - ovl_dt[overlaps_out_long, on = "row_idx", ovl_id := i.overlap_id] + ovl_dt[overlaps_out_long, on = "row_idx", "ovl_id" := i.overlap_id] retval <- ovl_dt[["ovl_id"]] } @@ -304,7 +304,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp } # Create new column with document type "NA" - x[, ovl_type := ifelse(is.na(x[["ovl_id"]]), NA_character_, "ovl_undetermined")] + x[, "ovl_type" := ifelse(is.na(x[["ovl_id"]]), NA_character_, "ovl_undetermined")] # set key for later foverlaps setkeyv(x, c(start_col, end_col)) @@ -320,7 +320,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp .SD[["text"]], .SD[["types"]], get_outer_inner_ovl_aux(.SD, start_col = start_col, end_col = end_col, verbose = verbose)), - by = ovl_id] + by = "ovl_id"] # For "partial" matches, create an inner and an outer version of the # annotation. This is currently experimental as it introduces annotations not @@ -332,8 +332,8 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp cli_alert_info(text = "Finding outer and inner segments for partial matches. This is experimental.") } - if (x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"), .N] > 0) { - overlaps_outer_dt <- x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"), + if (x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"), .N] > 0) { + overlaps_outer_dt <- x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"), list( doc = ifelse("doc" %in% colnames(.SD), .SD[["doc"]], NA), start = min(.SD[[start_col]]), @@ -343,9 +343,9 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp types = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, unique(.SD[["types"]]), list(list())), ovl_type = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, "ovl_partial|ovl_outer", "ovl_partial|ovl_multiple|ovl_outer") ), - by = ovl_id] + by = "ovl_id"] - overlaps_inner_dt <- x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"), + overlaps_inner_dt <- x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"), list( doc = ifelse("doc" %in% colnames(.SD), .SD[["doc"]], NA), start = min(get_inner_overlap_range(.SD, start_col = start_col, end_col = end_col)), @@ -357,7 +357,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp list(list())), ovl_type = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, "ovl_partial|ovl_inner", "ovl_partial|ovl_multiple|ovl_inner") ), - by = ovl_id] + by = "ovl_id"] add_ents_dt <- rbind(overlaps_outer_dt, overlaps_inner_dt) @@ -381,10 +381,10 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp cols <- c("ovl_longest", "ovl_shortest", "ovl_inner", "ovl_outer", "ovl_partial", "ovl_multiple", "ovl_distinct", "ovl_undetermined") - x[!is.na(ovl_id), (cols) := lapply(cols, function(x) grepl(pattern = x, ovl_type)), by = .I] + x[!is.na(x[["ovl_id"]]), (cols) := lapply(cols, function(x) grepl(pattern = x, x[["ovl_type"]])), by = .I] # after this, the ovl_type column is not needed anymore. - x[, ovl_type := NULL] + x[, "ovl_type" := NULL] if ("doc" %in% colnames(x)) { setorderv(x, c("doc", start_col)) @@ -558,7 +558,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) { ovl_unique_before <- length(unique(x[!is.na(ovl_id), ][["ovl_id"]])) # number of overlaps # first, keep all non-overlapping entities - x[is.na(ovl_id), ovl_keep := 1L] + x[is.na(x[["ovl_id"]]), ovl_keep := 1L] if (isTRUE(verbose)) { cli_alert_info("Identifing entities to {.strong keep}.") @@ -566,7 +566,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) { for (i in seq_along(keep)) { keep_i <- paste0("ovl_", keep[i]) - x[x[, .I[which(get(keep_i) == TRUE)], by = ovl_id]$V1, c("ovl_keep", "ovl_by") := list(i, keep[i])] + x[x[, .I[which(get(keep_i) == TRUE)], by = "ovl_id"]$V1, c("ovl_keep", "ovl_by") := list(i, keep[i])] } if (!is.null(omit)) { @@ -577,7 +577,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) { for (i in seq_along(omit)) { omit_i <- paste0("ovl_", omit[i]) - x[x[, .I[which(get(omit_i) == TRUE)], by = ovl_id]$V1, ovl_keep := -1L] + x[x[, .I[which(get(omit_i) == TRUE)], by = "ovl_id"]$V1, ovl_keep := -1L] } } @@ -616,10 +616,10 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) { ) } - x[!is.na(ovl_id), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = ovl_id] + x[!is.na(ovl_id), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = "ovl_id"] x <- x[x[, .I[which(.SD[["ovl_keep"]] > 0 & .SD[["ovl_keep"]] == min(.SD[["ovl_keep"]], na.rm = TRUE))], by = ovl_id][["V1"]], ] - ovl_unique_after <- length(unique(x[!is.na(ovl_id), ][["ovl_id"]])) + ovl_unique_after <- length(unique(x[!is.na(x[["ovl_id"]]), ][["ovl_id"]])) if (isTRUE(verbose)) { cli::cli_alert_info( diff --git a/R/segment.R b/R/segment.R index 7381f24..430e658 100644 --- a/R/segment.R +++ b/R/segment.R @@ -29,13 +29,14 @@ segment <- function(x, max_len = 7900L, overlap = 500L){ df[["esc"]] <- curl::curl_escape(df[["src"]]) df[["begin_esc"]] <- cumsum(c(1L, (nchar(df$esc) + 3L)[1L:(nrow(df) - 1L)])) + df[["end_esc"]] <- df[["begin_esc"]] + nchar(df[["esc"]]) # The total number of characters of the escaped string is the beginning of # the last offset plus the nchar of the last token nchar_esc <- df$begin_esc[nrow(df)] + nchar(df$esc[nrow(df)]) - 1L # based on paper & pencil math - n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap)) + n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap)) + 2 if (n_segments > 1){ half <- floor(max_len / 2) @@ -57,13 +58,13 @@ segment <- function(x, max_len = 7900L, overlap = 500L){ from <- if (i == 1L){ 1L } else { - max(which(df[["begin_esc"]] <= (anchors[i] - half))) + min(which(df[["begin_esc"]] > (anchors[i] - half))) } to <- if (i == length(anchors)){ nrow(df) } else { - min(which(df[["begin_esc"]] >= (anchors[i] + half))) + max(which(df[["end_esc"]] < (anchors[i] + half))) } df[from:to,] } @@ -75,5 +76,13 @@ segment <- function(x, max_len = 7900L, overlap = 500L){ segments <- list(x) names(segments) <- as.character(1) } - as.character(segments) + + nchar_seg_esc <- nchar( + unlist(lapply(lapply(y, `[[`, "esc"), paste, collapse = "%20")) + ) + + if (any(nchar_seg_esc > max_len)) + cli_alert_warning("segments exceed `max_len`") + + unlist(segments) } diff --git a/man/get_dbpedia_uris.Rd b/man/get_dbpedia_uris.Rd index 5068776..f10c38e 100644 --- a/man/get_dbpedia_uris.Rd +++ b/man/get_dbpedia_uris.Rd @@ -17,6 +17,8 @@ get_dbpedia_uris(x, ...) x, language = getOption("dbpedia.lang"), max_len = 5600L, + overlap = 500L, + offset = 1L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -31,6 +33,7 @@ get_dbpedia_uris(x, ...) x, language = getOption("dbpedia.lang"), max_len = 5600L, + overlap = 1000L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -46,6 +49,7 @@ get_dbpedia_uris(x, ...) p_attribute = "word", s_attribute = NULL, max_len = 5600L, + overlap = 1000L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -69,6 +73,7 @@ get_dbpedia_uris(x, ...) types = character(), support = 20, max_len = 5600L, + overlap = 1000L, expand_to_token = FALSE, verbose = TRUE, progress = FALSE @@ -78,6 +83,7 @@ get_dbpedia_uris(x, ...) x, language = getOption("dbpedia.lang"), max_len = 5600L, + overlap = 1000L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -96,6 +102,7 @@ get_dbpedia_uris(x, ...) token_tags = c("w", "pc"), text_tag = NULL, max_len = 5600L, + overlap = 1000L, confidence = 0.35, api = getOption("dbpedia.endpoint"), retry = TRUE, @@ -120,6 +127,12 @@ determine the stopwords used.} not exceed a defined length. If it does, an HTTP error results. The known threshold of 5600 characters is the default value.} +\item{overlap}{If the input string \code{x} is longer than \code{max_len}, the numnber +of overlapping characters (passed into \code{segment()}).} + +\item{offset}{An integer value with the base offset position of the text to +be annotated.} + \item{confidence}{A \code{numeric} value, the minimum similarity score that serves as threshold before DBpedia Spotlight includes a link into the report.} diff --git a/man/segment.Rd b/man/segment.Rd index d7fd5e0..2ff699c 100644 --- a/man/segment.Rd +++ b/man/segment.Rd @@ -15,8 +15,8 @@ processed.} \item{overlap}{Number of overlapping characters.} } \value{ -A named list of character vectors. The names are integer numbers that -indicate the character offset from the original string. +A named character vector. The names are integer numbers that indicate +the character offset from the original string. } \description{ Strings that are too long to be processed by DBpedia Spotlight are cut into diff --git a/tests/testthat/test-segment.R b/tests/testthat/test-segment.R index ce847bc..37e2fb9 100644 --- a/tests/testthat/test-segment.R +++ b/tests/testthat/test-segment.R @@ -27,4 +27,35 @@ test_that( expect_identical(nchar(article), nchar(article_reconstructed)) expect_identical(article, article_reconstructed) } +) + + +test_that( + "identity of results", + { + article <- corpus("REUTERS") %>% + polmineR::subset(id == "236") %>% # the longest article in the REUTERS corpus + get_token_stream(p_attribute = "word", collapse = " ") + + dbpedia_uris_ref <- get_dbpedia_uris( + x = article, + api = "http://api.dbpedia-spotlight.org/en/annotate", + language = "en", + verbose = FALSE, + max_len = 7500L + ) + + dbpedia_uris_seg <- get_dbpedia_uris( + x = article, + api = "http://api.dbpedia-spotlight.org/en/annotate", + language = "en", + verbose = FALSE, + max_len = 2000, + overlap = 750 + ) + + expect_identical(dbpedia_uris_ref, dbpedia_uris_seg) + + expect_identical(dbpedia_uris_ref, dbpedia_uris_seg) + } ) \ No newline at end of file