Skip to content

Commit

Permalink
process segmented vectors with overlength
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Blätte authored and Andreas Blätte committed Apr 10, 2024
1 parent e3f3c41 commit 759940d
Show file tree
Hide file tree
Showing 8 changed files with 200 additions and 4 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Package: dbpedia
Type: Package
Title: R Wrapper for DBpedia Spotlight
Version: 0.1.2.9005
Date: 2024-03-31
Date: 2024-04-10
Authors@R: c(
person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
person("Christoph", "Leonhardt", role = "aut")
Expand All @@ -17,6 +17,7 @@ Imports:
data.table,
tibble,
httr,
curl,
jsonlite,
RcppCWB,
fs,
Expand Down Expand Up @@ -52,6 +53,7 @@ Collate:
'xml.R'
'entity_types.R'
'overlaps.R'
'segment.R'
'zzz.R'
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ export(dbpedia_spotlight_status)
export(detect_overlap)
export(namespaced_xpath)
export(resolve_overlap)
export(segment)
export(sparql_query)
export(wikidata_query)
export(xml_enrich)
Expand All @@ -34,6 +35,7 @@ importFrom(cli,cli_text)
importFrom(cli,col_cyan)
importFrom(cli,format_error)
importFrom(cli,style_bold)
importFrom(curl,curl_escape)
importFrom(data.table,`:=`)
importFrom(data.table,as.data.table)
importFrom(data.table,data.table)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## dbpedia v0.1.2.9005

* A new auxiliary function `segment()` generates overlapping segments of text for strings longer than the maximum nchar that can be processed by DBpedia Spotlight.
* Method `get_dbpedia_uris()` has new argument `overlap` passed into `segment()`.


## dbpedia v0.1.2.9004

* Method `get_dbpedia_uris()` has new argument `retry` to retry if API is stalled #45 and new argument `logfile` for tracking and debugging longrunning annotation tasks. If the annotation failes, `NULL` is returned (no abort).
Expand Down
47 changes: 44 additions & 3 deletions R/dbpedia.R
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ setMethod(
x,
language = getOption("dbpedia.lang"),
max_len = 5600L,
overlap = 500L,
confidence = 0.35,
api = getOption("dbpedia.endpoint"),
retry = TRUE,
Expand All @@ -370,11 +371,49 @@ setMethod(
verbose = TRUE
) {

if (nchar(x) > max_len) {
nchar_escaped <- nchar(curl::curl_escape(x))
if (nchar_escaped > max_len) {
if (verbose) cli_alert_warning(
"input text has length {nchar(x)}, truncate to max_len ({.val {max_len}})"
"number of characters of escaped input string is {.val {nchar_escaped}} - will process segmented string"
)
x <- substr(x, 1L, max_len)
segs <- segment(x = x, max_len = max_len, overlap = overlap)
dts <- lapply(
segs,
function(seg){
get_dbpedia_uris(
x = seg,
language = language,
max_len = max_len, # input 'seg' must be below this threshold
overlap = overlap, # may not be needed
confidence = confidence,
api = api,
retry = retry,
logfile = logfile,
types = types,
support = support,
types_src = types_src,
verbose = verbose

)
}
)

pos <- as.integer(names(segs))
for (i in seq_along(dts)){
if (i == 1){
breakpoint <- (nchar(dts[[1L]]) - pos[2L]) / 2
dts[[1L]] <- dts[[1L]][dts[[1L]][["start"]] < breakpoint]
} else if (i == length(dts)){
breakpoint <- ((pos[i - 1L] + nchar(segs[i - 1L]) - 1L) - pos[i]) / 2
dts[[i]] <- dts[[i]][dts[[i]][["start"]] > breakpoint]
} else {
breakpoint_l <- ((pos[i - 1L] + nchar(segs[i - 1L] - 1L)) - pos[i]) / 2
breakpoint_r <- ((pos[i] + nchar(segs[i] - 1L)) - pos[i + 1]) / 2
dts[[i]] <- dts[[i]][dts[[i]][["start"]] > breakpoint_l]
dts[[i]] <- dts[[i]][dts[[i]][["start"]] < breakpoint_r]
}
}
return(rbindlist(dts))
}

if (!is.numeric(support) | !(length(support) == 1)) {
Expand Down Expand Up @@ -585,6 +624,8 @@ setMethod(
#' @param max_len An `integer` value. The text passed to DBpedia Spotlight may
#' not exceed a defined length. If it does, an HTTP error results. The known
#' threshold of 5600 characters is the default value.
#' @param overlap If the input string `x` is longer than `max_len`, the numnber
#' of overlapping characters (passed into `segment()`).
#' @param language The language of the input text ("en", "fr", "de", ...) to
#' determine the stopwords used.
#' @param confidence A `numeric` value, the minimum similarity score that serves
Expand Down
79 changes: 79 additions & 0 deletions R/segment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#' Cut long string into overlapping segments
#'
#' Strings that are too long to be processed by DBpedia Spotlight are cut into
#' overlapping segments that can be processed. Overlaps ensure that contextual
#' information is available for all entities.
#'
#' @return A named character vector. The names are integer numbers that indicate
#' the character offset from the original string.
#' @param x A string (length-one character vector) to process.
#' @param max_len The maximum length of (URL-escaped!) string that can be
#' processed.
#' @param overlap Number of overlapping characters.
#' @export
#' @importFrom curl curl_escape
#' @examples
#' library(polmineR)
#' use("RcppCWB") # make REUTERS corpus available
#'
#' article <- corpus("REUTERS") %>%
#' subset(id == "236") %>% # the longest article in the REUTERS corpus
#' get_token_stream(p_attribute = "word", collapse = " ")
#'
#' segs <- segment(x = article, max_len = 500, overlap = 100)
segment <- function(x, max_len = 7900L, overlap = 500L){
# check that length(x) == 1L

df <- data.frame(src = strsplit(x, split = " ")[[1]])
df[["begin"]] <- cumsum(c(1L, (nchar(df$src) + 1L)[1L:(nrow(df) - 1L)]))

df[["esc"]] <- curl::curl_escape(df[["src"]])
df[["begin_esc"]] <- cumsum(c(1L, (nchar(df$esc) + 3L)[1L:(nrow(df) - 1L)]))

# The total number of characters of the escaped string is the beginning of
# the last offset plus the nchar of the last token
nchar_esc <- df$begin_esc[nrow(df)] + nchar(df$esc[nrow(df)]) - 1L

# based on paper & pencil math
n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap))

if (n_segments > 1){
half <- floor(max_len / 2)
last <- nchar_esc - half
anchors <- c(half, last)

if (n_segments > 2){
anchors <- sort(c(
anchors,
half + cumsum(
rep((last - half) / (n_segments - 1), times = n_segments - 2)
)
))
}

y <- lapply(
seq_along(anchors),
function(i){
from <- if (i == 1L){
1L
} else {
max(which(df[["begin_esc"]] <= (anchors[i] - half)))
}

to <- if (i == length(anchors)){
nrow(df)
} else {
min(which(df[["begin_esc"]] >= (anchors[i] + half)))
}
df[from:to,]
}
)

segments <- lapply(lapply(y, `[[`, "src"), paste, collapse = " ")
names(segments) <- lapply(y, `[`, 1, "begin")
} else {
segments <- list(x)
names(segments) <- as.character(1)
}
as.character(segments)
}
1 change: 1 addition & 0 deletions man/get_dbpedia_uris.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions man/segment.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions tests/testthat/test-segment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
library(polmineR)
use("RcppCWB") # make REUTERS corpus available

test_that(
"ensure that segments add up to original string",
{
article <- corpus("REUTERS") %>%
polmineR::subset(id == "236") %>% # the longest article in the REUTERS corpus
get_token_stream(p_attribute = "word", collapse = " ")

segs <- segment(x = article, max_len = 500, overlap = 100)

# we grow the reconstructed string ...
article_reconstructed <- character()
for (i in seq_along(segs)){
article_reconstructed <- paste(
substr(
article_reconstructed,
start = 1L,
stop = as.integer(names(segs)[[i]]) - 1L
),
segs[[i]],
sep = ""
)
}

expect_identical(nchar(article), nchar(article_reconstructed))
expect_identical(article, article_reconstructed)
}
)

0 comments on commit 759940d

Please sign in to comment.