-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
process segmented vectors with overlength
- Loading branch information
Andreas Blätte
authored and
Andreas Blätte
committed
Apr 10, 2024
1 parent
e3f3c41
commit 759940d
Showing
8 changed files
with
200 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#' Cut long string into overlapping segments
#'
#' Strings that are too long to be processed by DBpedia Spotlight are cut into
#' overlapping segments that can be processed. Overlaps ensure that contextual
#' information is available for all entities.
#'
#' @return A named character vector. The names are integer numbers that indicate
#'   the character offset from the original string.
#' @param x A string (length-one character vector) to process.
#' @param max_len The maximum length of (URL-escaped!) string that can be
#'   processed.
#' @param overlap Number of overlapping characters.
#' @export
#' @importFrom curl curl_escape
#' @examples
#' library(polmineR)
#' use("RcppCWB") # make REUTERS corpus available
#'
#' article <- corpus("REUTERS") %>%
#'   subset(id == "236") %>% # the longest article in the REUTERS corpus
#'   get_token_stream(p_attribute = "word", collapse = " ")
#'
#' segs <- segment(x = article, max_len = 500, overlap = 100)
segment <- function(x, max_len = 7900L, overlap = 500L){
  # Validate inputs up front (resolves the former TODO "check that
  # length(x) == 1L"); overlap must be smaller than max_len or the
  # segment-count formula below divides by a non-positive number.
  stopifnot(
    is.character(x), length(x) == 1L, !is.na(x),
    is.numeric(max_len), length(max_len) == 1L,
    is.numeric(overlap), length(overlap) == 1L,
    overlap < max_len
  )

  # One row per whitespace-separated token; 'begin' is the 1-based character
  # offset of each token within the original (unescaped) string.
  df <- data.frame(src = strsplit(x, split = " ")[[1]])
  # seq_len() rather than 1:(n - 1): for a single-token input, 1:0 yields
  # c(1, 0) and the cumsum() vector would be longer than the data.frame.
  df[["begin"]] <- cumsum(c(1L, (nchar(df$src) + 1L)[seq_len(nrow(df) - 1L)]))

  # Offsets within the URL-escaped string: an escaped space ("%20")
  # contributes 3 characters between consecutive tokens.
  df[["esc"]] <- curl::curl_escape(df[["src"]])
  df[["begin_esc"]] <- cumsum(c(1L, (nchar(df$esc) + 3L)[seq_len(nrow(df) - 1L)]))

  # The total number of characters of the escaped string is the beginning of
  # the last offset plus the nchar of the last token
  nchar_esc <- df$begin_esc[nrow(df)] + nchar(df$esc[nrow(df)]) - 1L

  # Number of segments needed so that consecutive max_len-sized segments
  # share 'overlap' escaped characters (based on paper & pencil math).
  n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap))

  if (n_segments > 1){
    # Anchors are segment midpoints in escaped-character coordinates: the
    # first and last are pinned half a segment from either end, any further
    # anchors are spread evenly in between.
    half <- floor(max_len / 2)
    last <- nchar_esc - half
    anchors <- c(half, last)

    if (n_segments > 2){
      anchors <- sort(c(
        anchors,
        half + cumsum(
          rep((last - half) / (n_segments - 1), times = n_segments - 2)
        )
      ))
    }

    # For every anchor, select the token range covering anchor +/- half,
    # snapping outwards to token boundaries so no token is split.
    y <- lapply(
      seq_along(anchors),
      function(i){
        from <- if (i == 1L){
          1L
        } else {
          max(which(df[["begin_esc"]] <= (anchors[i] - half)))
        }

        to <- if (i == length(anchors)){
          nrow(df)
        } else {
          min(which(df[["begin_esc"]] >= (anchors[i] + half)))
        }
        df[from:to, ]
      }
    )

    segments <- lapply(lapply(y, `[[`, "src"), paste, collapse = " ")
    # Names carry the unescaped character offset of each segment's first
    # token, so callers can reassemble the original string.
    names(segments) <- lapply(y, `[`, 1, "begin")
  } else {
    segments <- list(x)
    names(segments) <- "1"
  }
  as.character(segments)
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
library(polmineR)
use("RcppCWB") # make REUTERS corpus available

test_that(
  "ensure that segments add up to original string",
  {
    article <- corpus("REUTERS") %>%
      polmineR::subset(id == "236") %>% # the longest article in the REUTERS corpus
      get_token_stream(p_attribute = "word", collapse = " ")

    segs <- segment(x = article, max_len = 500, overlap = 100)

    # Rebuild the article by overlaying each segment at the character
    # offset stored in its name: keep everything before the offset, then
    # append the segment (overlapping tails are overwritten).
    reassembled <- Reduce(
      function(acc, i) {
        offset <- as.integer(names(segs)[[i]])
        paste0(substr(acc, 1L, offset - 1L), segs[[i]])
      },
      seq_along(segs),
      ""
    )

    expect_identical(nchar(article), nchar(reassembled))
    expect_identical(article, reassembled)
  }
)