-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
process segmented vectors with overlength
- Loading branch information
Andreas Blätte
authored and
Andreas Blätte
committed
Apr 10, 2024
1 parent
e3f3c41
commit 759940d
Showing
8 changed files
with
200 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#' Cut long string into overlapping segments
#'
#' Strings that are too long to be processed by DBpedia Spotlight are cut into
#' overlapping segments that can be processed. Overlaps ensure that contextual
#' information is available for all entities.
#'
#' @return A named character vector. The names are integer numbers that indicate
#'   the character offset from the original string.
#' @param x A string (length-one character vector) to process.
#' @param max_len The maximum length of (URL-escaped!) string that can be
#'   processed.
#' @param overlap Number of overlapping characters.
#' @export
#' @importFrom curl curl_escape
#' @examples
#' library(polmineR)
#' use("RcppCWB") # make REUTERS corpus available
#'
#' article <- corpus("REUTERS") %>%
#'   subset(id == "236") %>% # the longest article in the REUTERS corpus
#'   get_token_stream(p_attribute = "word", collapse = " ")
#'
#' segs <- segment(x = article, max_len = 500, overlap = 100)
segment <- function(x, max_len = 7900L, overlap = 500L){
  # Validate inputs up front (resolves the former TODO "check that
  # length(x) == 1L"); overlap must be smaller than max_len or the
  # segment-count formula below divides by a non-positive number.
  stopifnot(
    is.character(x), length(x) == 1L, !is.na(x),
    is.numeric(max_len), length(max_len) == 1L,
    is.numeric(overlap), length(overlap) == 1L,
    overlap < max_len
  )

  # One row per whitespace-separated token; 'begin' is the 1-based character
  # offset of each token within the original (unescaped) string.
  df <- data.frame(src = strsplit(x, split = " ")[[1]])
  # seq_len() rather than 1:(n - 1): for a single-token input, 1:0 yields
  # c(1, 0) and the cumsum() vector would be longer than the data.frame.
  df[["begin"]] <- cumsum(c(1L, (nchar(df$src) + 1L)[seq_len(nrow(df) - 1L)]))

  # Offsets within the URL-escaped string: an escaped space ("%20")
  # contributes 3 characters between consecutive tokens.
  df[["esc"]] <- curl::curl_escape(df[["src"]])
  df[["begin_esc"]] <- cumsum(c(1L, (nchar(df$esc) + 3L)[seq_len(nrow(df) - 1L)]))

  # The total number of characters of the escaped string is the beginning of
  # the last offset plus the nchar of the last token
  nchar_esc <- df$begin_esc[nrow(df)] + nchar(df$esc[nrow(df)]) - 1L

  # Number of segments needed so that consecutive max_len-sized segments
  # share 'overlap' escaped characters (based on paper & pencil math).
  n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap))

  if (n_segments > 1){
    # Anchors are segment midpoints in escaped-character coordinates: the
    # first and last are pinned half a segment from either end, any further
    # anchors are spread evenly in between.
    half <- floor(max_len / 2)
    last <- nchar_esc - half
    anchors <- c(half, last)

    if (n_segments > 2){
      anchors <- sort(c(
        anchors,
        half + cumsum(
          rep((last - half) / (n_segments - 1), times = n_segments - 2)
        )
      ))
    }

    # For every anchor, select the token range covering anchor +/- half,
    # snapping outwards to token boundaries so no token is split.
    y <- lapply(
      seq_along(anchors),
      function(i){
        from <- if (i == 1L){
          1L
        } else {
          max(which(df[["begin_esc"]] <= (anchors[i] - half)))
        }

        to <- if (i == length(anchors)){
          nrow(df)
        } else {
          min(which(df[["begin_esc"]] >= (anchors[i] + half)))
        }
        df[from:to, ]
      }
    )

    segments <- lapply(lapply(y, `[[`, "src"), paste, collapse = " ")
    # Names carry the unescaped character offset of each segment's first
    # token, so callers can reassemble the original string.
    names(segments) <- lapply(y, `[`, 1, "begin")
  } else {
    segments <- list(x)
    names(segments) <- "1"
  }
  as.character(segments)
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
library(polmineR)
use("RcppCWB") # make REUTERS corpus available

test_that(
  "ensure that segments add up to original string",
  {
    article <- corpus("REUTERS") %>%
      polmineR::subset(id == "236") %>% # the longest article in the REUTERS corpus
      get_token_stream(p_attribute = "word", collapse = " ")

    segs <- segment(x = article, max_len = 500, overlap = 100)

    # Rebuild the article by overlaying each segment at the character
    # offset stored in its name: keep everything before the offset, then
    # append the segment (overlapping tails are overwritten).
    reassembled <- Reduce(
      function(acc, i) {
        offset <- as.integer(names(segs)[[i]])
        paste0(substr(acc, 1L, offset - 1L), segs[[i]])
      },
      seq_along(segs),
      ""
    )

    expect_identical(nchar(article), nchar(reassembled))
    expect_identical(article, reassembled)
  }
)