From 7371b00502fa64dd4d6fcab9869b087093432c4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?=
 <andreasblatte@MBP-von-Andreas.fritz.box>
Date: Wed, 10 Apr 2024 17:31:51 +0200
Subject: [PATCH] address data.table R CMD check warnings

---
 NEWS.md                       |  1 +
 R/dbpedia.R                   | 50 ++++++++++++++++++++++++-----------
 R/overlaps.R                  | 34 ++++++++++++------------
 R/segment.R                   | 17 +++++++++---
 man/get_dbpedia_uris.Rd       | 13 +++++++++
 man/segment.Rd                |  4 +--
 tests/testthat/test-segment.R | 31 ++++++++++++++++++++++
 7 files changed, 111 insertions(+), 39 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 5308978..03fd663 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,7 @@
 
 * A new auxiliary function `segment()` generates overlapping segments of text for strings longer than the maximum nchar that can be processed by DBpedia Spotlight.
 * Method `get_dbpedia_uris()` has new argument `overlap` passed into `segment()`. 
+* Method `get_dbpedia_uris()` has new argument `offset` to indicate base offset number other than 1.
 
 
 ## dbpedia v0.1.2.9004
diff --git a/R/dbpedia.R b/R/dbpedia.R
index 2a3225b..a17dcbc 100644
--- a/R/dbpedia.R
+++ b/R/dbpedia.R
@@ -1,5 +1,5 @@
 `:=` <- function(...) NULL
-.SD <- NULL
+.SD <- .GRP <- .I <- .N <- NULL
 
 #' Set and report status of DBpedia Spotlight
 #' 
@@ -326,6 +326,7 @@ setGeneric(
 #' @exportMethod get_dbpedia_uris
 #' @rdname get_dbpedia_uris
 #' @importFrom data.table data.table
+#' @importFrom utils capture.output
 #' @examples
 #' \dontrun{
 #' # Process AnnotatedPlainTextDocument (example available in NLP package)
@@ -361,6 +362,7 @@ setMethod(
     language = getOption("dbpedia.lang"),
     max_len = 5600L,
     overlap = 500L,
+    offset = 1L,
     confidence = 0.35,
     api = getOption("dbpedia.endpoint"),
     retry = TRUE,
@@ -378,13 +380,14 @@ setMethod(
       )
       segs <- segment(x = x, max_len = max_len, overlap = overlap)
       dts <- lapply(
-        segs,
-        function(seg){
+        seq_along(segs),
+        function(i){
           get_dbpedia_uris(
-            x = seg,
+            x = segs[[i]],
             language = language,
             max_len = max_len, # input 'seg' must be below this threshold
             overlap = overlap, # may not be needed
+            offset = as.integer(names(segs)[i]),
             confidence = confidence,
             api = api,
             retry = retry,
@@ -398,18 +401,21 @@ setMethod(
         }
       )
       
-      pos <- as.integer(names(segs))
+      offset <- as.integer(names(segs))
       for (i in seq_along(dts)){
-        if (i == 1){
-          breakpoint <- (nchar(dts[[1L]]) - pos[2L]) / 2
-          dts[[1L]] <- dts[[1L]][dts[[1L]][["start"]] < breakpoint]
+        if (i == 1L){
+          breakpoint_r <- offset[2L] + (nchar(segs[1L]) - offset[2L]) / 2
+          dts[[1L]] <- dts[[1L]][dts[[1L]][["start"]] < breakpoint_r]
         } else if (i == length(dts)){
-          breakpoint <- ((pos[i - 1L] + nchar(segs[i - 1L]) - 1L) - pos[i]) / 2
-          dts[[i]] <- dts[[i]][dts[[i]][["start"]] > breakpoint]
+          offset_prev <- offset[i - 1L] + nchar(segs[i - 1L]) - 1L
+          breakpoint_l <- offset[i] + ((offset_prev - offset[i]) / 2)
+          dts[[i]] <- dts[[i]][dts[[i]][["start"]] >= breakpoint_l]
         } else {
-          breakpoint_l <- ((pos[i - 1L] + nchar(segs[i - 1L] - 1L)) - pos[i]) / 2
-          breakpoint_r <- ((pos[i] + nchar(segs[i] - 1L)) - pos[i + 1]) / 2
-          dts[[i]] <- dts[[i]][dts[[i]][["start"]] > breakpoint_l]
+          offset_prev <- offset[i - 1L] + nchar(segs[i - 1L]) - 1L
+          breakpoint_l <- offset[i] + ((offset_prev - offset[i]) / 2)
+          rbound_current <- offset[i] + nchar(segs[i]) - 1L
+          breakpoint_r <- offset[i + 1] + ((rbound_current - offset[i + 1]) / 2)
+          dts[[i]] <- dts[[i]][dts[[i]][["start"]] >= breakpoint_l]
           dts[[i]] <- dts[[i]][dts[[i]][["start"]] < breakpoint_r]
         }
       }
@@ -428,7 +434,7 @@ setMethod(
     )
     
     if (verbose) cli_progress_step("send request to DBpedia Spotlight")
-    request_max <- if (is.logical(retry)) as.integer(request) else retry
+    request_max <- if (is.logical(retry)) as.integer(retry) else retry
     request_number <- 1L
     proceed <- TRUE
     
@@ -523,8 +529,8 @@ setMethod(
     )
     setcolorder(resources_min, c("start", "text", "dbpedia_uri", "types"))
     
-    resources_min[, "start" := as.integer(resources_min[["start"]]) + 1L]
-    
+    resources_min[, "start" := as.integer(resources_min[["start"]]) + offset]
+
     # See issue 41.
     types_list <- strsplit(x = resources_min[["types"]], split = ",")
     
@@ -581,6 +587,7 @@ setMethod(
     x,
     language = getOption("dbpedia.lang"),
     max_len = 5600L,
+    overlap = 1000L,
     confidence = 0.35,
     api = getOption("dbpedia.endpoint"),
     retry = TRUE,
@@ -594,6 +601,7 @@ setMethod(
       x = as.character(x[["content"]]),
       language = language,
       max_len = max_len,
+      overlap = overlap,
       confidence = confidence,
       api = api,
       retry = retry,
@@ -626,6 +634,8 @@ setMethod(
 #'   threshold of 5600 characters is the default value.
 #' @param overlap If the input string `x` is longer than `max_len`, the numnber
 #'   of overlapping characters (passed into `segment()`).
+#' @param offset An integer value with the base offset position of the text to
+#'   be annotated.
 #' @param language The language of the input text ("en", "fr", "de", ...) to
 #'   determine the stopwords used.
 #' @param confidence A `numeric` value, the minimum similarity score that serves
@@ -706,6 +716,7 @@ setMethod(
     p_attribute = "word",
     s_attribute = NULL,
     max_len = 5600L,
+    overlap = 1000L,
     confidence = 0.35,
     api = getOption("dbpedia.endpoint"),
     retry = TRUE,
@@ -744,6 +755,7 @@ setMethod(
     x = doc,
     language = language,
     max_len = max_len,
+    overlap = overlap,
     confidence = confidence,
     api = api,
     retry = retry,
@@ -890,6 +902,7 @@ setMethod(
     types = character(),
     support = 20,
     max_len = 5600L,
+    overlap = 1000L,
     expand_to_token = FALSE,
     verbose = TRUE,
     progress = FALSE
@@ -909,6 +922,7 @@ setMethod(
         language = language,
         s_attribute = s_attribute,
         max_len = max_len,
+        overlap = overlap,
         confidence = confidence,
         api = api,
         retry = retry,
@@ -956,6 +970,7 @@ setMethod(
     x,
     language = getOption("dbpedia.lang"),
     max_len = 5600L,
+    overlap = 1000L,
     confidence = 0.35,
     api = getOption("dbpedia.endpoint"),
     retry = TRUE,
@@ -992,6 +1007,7 @@ setMethod(
             x = docs[[docname]],
             language = language,
             max_len = max_len,
+            overlap = overlap,
             confidence = confidence,
             api = api,
             retry = retry,
@@ -1032,6 +1048,7 @@ setMethod(
     token_tags = c("w", "pc"),
     text_tag = NULL,
     max_len = 5600L,
+    overlap = 1000L,
     confidence = 0.35,
     api = getOption("dbpedia.endpoint"),
     retry = TRUE,
@@ -1127,6 +1144,7 @@ setMethod(
       x = doc,
       language = language,
       max_len = max_len,
+      overlap = overlap,
       confidence = confidence,
       api = api,
       retry = retry,
diff --git a/R/overlaps.R b/R/overlaps.R
index de2d7da..71930e5 100644
--- a/R/overlaps.R
+++ b/R/overlaps.R
@@ -64,7 +64,7 @@ detect_overlap <- function(x,
   }
 
   if ("doc" %in% colnames(x)) {
-    x[, ovl_id := detect_overlap_aux(.SD,
+    x[, "ovl_id" := detect_overlap_aux(.SD,
                                      group_id = .GRP,
                                      start_col = start_col,
                                      end_col = end_col,
@@ -73,7 +73,7 @@ detect_overlap <- function(x,
 
   } else {
 
-    x[, ovl_id := detect_overlap_aux(input_dt = x,
+    x[, "ovl_id" := detect_overlap_aux(input_dt = x,
                                      group_id = NULL,
                                      start_col = start_col,
                                      end_col = end_col,
@@ -192,7 +192,7 @@ detect_overlap_aux <- function(input_dt,
     }
 
     # merge to input
-    ovl_dt[overlaps_out_long, on = "row_idx", ovl_id := i.overlap_id]
+    ovl_dt[overlaps_out_long, on = "row_idx", "ovl_id" := i.overlap_id]
 
     retval <- ovl_dt[["ovl_id"]]
   }
@@ -304,7 +304,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
   }
 
   # Create new column with document type "NA"
-  x[, ovl_type := ifelse(is.na(x[["ovl_id"]]), NA_character_, "ovl_undetermined")]
+  x[, "ovl_type" := ifelse(is.na(x[["ovl_id"]]), NA_character_, "ovl_undetermined")]
 
   # set key for later foverlaps
   setkeyv(x, c(start_col, end_col))
@@ -320,7 +320,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
     .SD[["text"]],
     .SD[["types"]],
     get_outer_inner_ovl_aux(.SD, start_col = start_col, end_col = end_col, verbose = verbose)),
-    by = ovl_id]
+    by = "ovl_id"]
 
   # For "partial" matches, create an inner and an outer version of the
   # annotation. This is currently experimental as it introduces annotations not
@@ -332,8 +332,8 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
       cli_alert_info(text = "Finding outer and inner segments for partial matches. This is experimental.")
     }
 
-    if (x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"), .N] > 0) {
-      overlaps_outer_dt <- x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"),
+    if (x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"), .N] > 0) {
+      overlaps_outer_dt <- x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"),
                              list(
                                doc = ifelse("doc" %in% colnames(.SD), .SD[["doc"]], NA),
                                start = min(.SD[[start_col]]),
@@ -343,9 +343,9 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
                                types = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, unique(.SD[["types"]]), list(list())),
                                ovl_type = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, "ovl_partial|ovl_outer", "ovl_partial|ovl_multiple|ovl_outer")
                              ),
-                             by = ovl_id]
+                             by = "ovl_id"]
 
-      overlaps_inner_dt <- x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"),
+      overlaps_inner_dt <- x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"),
                              list(
                                doc = ifelse("doc" %in% colnames(.SD), .SD[["doc"]], NA),
                                start = min(get_inner_overlap_range(.SD, start_col = start_col, end_col = end_col)),
@@ -357,7 +357,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
                                               list(list())),
                                ovl_type = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, "ovl_partial|ovl_inner", "ovl_partial|ovl_multiple|ovl_inner")
                              ),
-                             by = ovl_id]
+                             by = "ovl_id"]
 
       add_ents_dt <- rbind(overlaps_outer_dt, overlaps_inner_dt)
 
@@ -381,10 +381,10 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
   cols <- c("ovl_longest", "ovl_shortest", "ovl_inner", "ovl_outer",
             "ovl_partial", "ovl_multiple", "ovl_distinct", "ovl_undetermined")
 
-  x[!is.na(ovl_id), (cols) := lapply(cols, function(x) grepl(pattern = x, ovl_type)), by = .I]
+  x[!is.na(x[["ovl_id"]]), (cols) := lapply(cols, function(x) grepl(pattern = x, x[["ovl_type"]])), by = .I]
 
   # after this, the ovl_type column is not needed anymore.
-  x[, ovl_type := NULL]
+  x[, "ovl_type" := NULL]
 
   if ("doc" %in% colnames(x)) {
     setorderv(x, c("doc", start_col))
@@ -558,7 +558,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
   ovl_unique_before <- length(unique(x[!is.na(ovl_id), ][["ovl_id"]])) # number of overlaps
 
   # first, keep all non-overlapping entities
-  x[is.na(ovl_id), ovl_keep := 1L]
+  x[is.na(x[["ovl_id"]]), ovl_keep := 1L]
 
   if (isTRUE(verbose)) {
     cli_alert_info("Identifing entities to {.strong keep}.")
@@ -566,7 +566,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
 
   for (i in seq_along(keep)) {
     keep_i <- paste0("ovl_", keep[i])
-    x[x[, .I[which(get(keep_i) == TRUE)], by = ovl_id]$V1, c("ovl_keep", "ovl_by") := list(i, keep[i])]
+    x[x[, .I[which(get(keep_i) == TRUE)], by = "ovl_id"]$V1, c("ovl_keep", "ovl_by") := list(i, keep[i])]
   }
 
   if (!is.null(omit)) {
@@ -577,7 +577,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
 
     for (i in seq_along(omit)) {
       omit_i <- paste0("ovl_", omit[i])
-      x[x[, .I[which(get(omit_i) == TRUE)], by = ovl_id]$V1, ovl_keep := -1L]
+      x[x[, .I[which(get(omit_i) == TRUE)], by = "ovl_id"]$V1, ovl_keep := -1L]
     }
   }
 
@@ -616,10 +616,10 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
     )
   }
 
-  x[!is.na(ovl_id), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = ovl_id]
+  x[!is.na(ovl_id), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = "ovl_id"]
   x <- x[x[, .I[which(.SD[["ovl_keep"]] > 0 & .SD[["ovl_keep"]] == min(.SD[["ovl_keep"]], na.rm = TRUE))], by = ovl_id][["V1"]], ]
 
-  ovl_unique_after <- length(unique(x[!is.na(ovl_id), ][["ovl_id"]]))
+  ovl_unique_after <- length(unique(x[!is.na(x[["ovl_id"]]), ][["ovl_id"]]))
 
   if (isTRUE(verbose)) {
     cli::cli_alert_info(
diff --git a/R/segment.R b/R/segment.R
index 7381f24..430e658 100644
--- a/R/segment.R
+++ b/R/segment.R
@@ -29,13 +29,14 @@ segment <- function(x, max_len = 7900L, overlap = 500L){
   
   df[["esc"]] <- curl::curl_escape(df[["src"]])
   df[["begin_esc"]] <- cumsum(c(1L, (nchar(df$esc) + 3L)[1L:(nrow(df) - 1L)]))
+  df[["end_esc"]] <- df[["begin_esc"]] + nchar(df[["esc"]])
   
   # The total number of characters of the escaped string is the beginning of 
   # the last offset plus the nchar of the last token
   nchar_esc <- df$begin_esc[nrow(df)] + nchar(df$esc[nrow(df)]) - 1L
   
   # based on paper & pencil math
-  n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap))
+  n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap)) + 2
   
   if (n_segments > 1){
     half <- floor(max_len / 2)
@@ -57,13 +58,13 @@ segment <- function(x, max_len = 7900L, overlap = 500L){
         from <- if (i == 1L){
           1L
         } else {
-          max(which(df[["begin_esc"]] <= (anchors[i] - half)))
+          min(which(df[["begin_esc"]] > (anchors[i] - half)))
         }
         
         to <- if (i == length(anchors)){
           nrow(df)
         } else {
-          min(which(df[["begin_esc"]] >= (anchors[i] + half)))
+          max(which(df[["end_esc"]] < (anchors[i] + half)))
         }
         df[from:to,]
       }
@@ -75,5 +76,13 @@ segment <- function(x, max_len = 7900L, overlap = 500L){
     segments <- list(x)
     names(segments) <- as.character(1)
   }
-  as.character(segments)
+  
+  nchar_seg_esc <- nchar(
+    unlist(lapply(lapply(y, `[[`, "esc"), paste, collapse = "%20"))
+  )
+  
+  if (any(nchar_seg_esc > max_len))
+    cli_alert_warning("segments exceed `max_len`")
+
+  unlist(segments)
 }
diff --git a/man/get_dbpedia_uris.Rd b/man/get_dbpedia_uris.Rd
index 5068776..f10c38e 100644
--- a/man/get_dbpedia_uris.Rd
+++ b/man/get_dbpedia_uris.Rd
@@ -17,6 +17,8 @@ get_dbpedia_uris(x, ...)
   x,
   language = getOption("dbpedia.lang"),
   max_len = 5600L,
+  overlap = 500L,
+  offset = 1L,
   confidence = 0.35,
   api = getOption("dbpedia.endpoint"),
   retry = TRUE,
@@ -31,6 +33,7 @@ get_dbpedia_uris(x, ...)
   x,
   language = getOption("dbpedia.lang"),
   max_len = 5600L,
+  overlap = 1000L,
   confidence = 0.35,
   api = getOption("dbpedia.endpoint"),
   retry = TRUE,
@@ -46,6 +49,7 @@ get_dbpedia_uris(x, ...)
   p_attribute = "word",
   s_attribute = NULL,
   max_len = 5600L,
+  overlap = 1000L,
   confidence = 0.35,
   api = getOption("dbpedia.endpoint"),
   retry = TRUE,
@@ -69,6 +73,7 @@ get_dbpedia_uris(x, ...)
   types = character(),
   support = 20,
   max_len = 5600L,
+  overlap = 1000L,
   expand_to_token = FALSE,
   verbose = TRUE,
   progress = FALSE
@@ -78,6 +83,7 @@ get_dbpedia_uris(x, ...)
   x,
   language = getOption("dbpedia.lang"),
   max_len = 5600L,
+  overlap = 1000L,
   confidence = 0.35,
   api = getOption("dbpedia.endpoint"),
   retry = TRUE,
@@ -96,6 +102,7 @@ get_dbpedia_uris(x, ...)
   token_tags = c("w", "pc"),
   text_tag = NULL,
   max_len = 5600L,
+  overlap = 1000L,
   confidence = 0.35,
   api = getOption("dbpedia.endpoint"),
   retry = TRUE,
@@ -120,6 +127,12 @@ determine the stopwords used.}
 not exceed a defined length. If it does, an HTTP error results. The known
 threshold of 5600 characters is the default value.}
 
+\item{overlap}{If the input string \code{x} is longer than \code{max_len}, the numnber
+of overlapping characters (passed into \code{segment()}).}
+
+\item{offset}{An integer value with the base offset position of the text to
+be annotated.}
+
 \item{confidence}{A \code{numeric} value, the minimum similarity score that serves
 as threshold before DBpedia Spotlight includes a link into the report.}
 
diff --git a/man/segment.Rd b/man/segment.Rd
index d7fd5e0..2ff699c 100644
--- a/man/segment.Rd
+++ b/man/segment.Rd
@@ -15,8 +15,8 @@ processed.}
 \item{overlap}{Number of overlapping characters.}
 }
 \value{
-A named list of character vectors. The names are integer numbers that
-indicate the character offset from the original string.
+A named character vector. The names are integer numbers that indicate
+the character offset from the original string.
 }
 \description{
 Strings that are too long to be processed by DBpedia Spotlight are cut into
diff --git a/tests/testthat/test-segment.R b/tests/testthat/test-segment.R
index ce847bc..37e2fb9 100644
--- a/tests/testthat/test-segment.R
+++ b/tests/testthat/test-segment.R
@@ -27,4 +27,35 @@ test_that(
     expect_identical(nchar(article), nchar(article_reconstructed))
     expect_identical(article, article_reconstructed)
   }
+)
+
+
+test_that(
+  "identity of results",
+  {
+    article <- corpus("REUTERS") %>%
+      polmineR::subset(id == "236") %>% # the longest article in the REUTERS corpus
+      get_token_stream(p_attribute = "word", collapse = " ")
+    
+    dbpedia_uris_ref <- get_dbpedia_uris(
+      x = article,
+      api = "http://api.dbpedia-spotlight.org/en/annotate",
+      language = "en",
+      verbose = FALSE,
+      max_len = 7500L
+    )
+    
+    dbpedia_uris_seg <- get_dbpedia_uris(
+      x = article,
+      api = "http://api.dbpedia-spotlight.org/en/annotate",
+      language = "en",
+      verbose = FALSE,
+      max_len = 2000,
+      overlap = 750
+    )
+    
+    expect_identical(dbpedia_uris_ref, dbpedia_uris_seg)
+    
+    expect_identical(dbpedia_uris_ref, dbpedia_uris_seg)
+  }
 )
\ No newline at end of file