From 0a3e089637bb05108dbe8df6bab6e56bb136679d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 7 Jul 2023 14:16:58 +0200 Subject: [PATCH] as_docgroups() tested --- DESCRIPTION | 3 +- NAMESPACE | 3 +- NEWS.md | 5 + R/{nchars.R => charcount.R} | 0 R/detect_duplicates.R | 60 ++++++++-- R/encode.R | 118 ++++++++++++++++++ R/utils.R | 151 ------------------------ man/{nchars.Rd => charcount.Rd} | 38 +++--- man/detect_duplicates.Rd | 8 +- man/docgroups.Rd | 14 +++ man/duplicates_as_annotation_data.Rd | 2 +- man/duplicates_encode.Rd | 2 +- man/duplicates_get_groups.Rd | 14 --- tests/testthat/test_detect_duplicates.R | 10 +- tests/testthat/test_nchars.R | 4 +- vignettes/vignette.Rmd | 2 +- 16 files changed, 231 insertions(+), 203 deletions(-) rename R/{nchars.R => charcount.R} (100%) create mode 100644 R/encode.R rename man/{nchars.Rd => charcount.Rd} (64%) create mode 100644 man/docgroups.Rd delete mode 100644 man/duplicates_get_groups.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 72d550f..38ae1fe 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,8 +31,9 @@ Encoding: UTF-8 License: GPL-3 Collate: 'duplicates_package.R' - 'nchars.R' + 'charcount.R' 'detect_duplicates.R' 'utils.R' + 'encode.R' RoxygenNote: 7.2.3 Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index e842451..bc76ff6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,10 @@ # Generated by roxygen2: do not edit by hand +export(as_docgroups) export(charfilter) export(detect_duplicates) +exportMethods(charcount) exportMethods(detect_duplicates) -exportMethods(nchars) import(data.table) importFrom(Matrix,triu) importFrom(R6,R6Class) diff --git a/NEWS.md b/NEWS.md index 3a12b6e..0048b04 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,11 @@ ## v0.1.3 - Function `minimize_vocabulary()` more generic and renamed as `charfilter()`. +- Method `nchars()` renamed to `charcount()`. +- Function `duplicates_get_groups()` renamed to `as_docgroups()`. 
+- Argument `s_attribute` of method `detect_duplicates()` used generically. A new +column with the name of the s-attribute to be used as metadata will be +added. ## v0.1.2 diff --git a/R/nchars.R b/R/charcount.R similarity index 100% rename from R/nchars.R rename to R/charcount.R diff --git a/R/detect_duplicates.R b/R/detect_duplicates.R index afdac9d..195e875 100644 --- a/R/detect_duplicates.R +++ b/R/detect_duplicates.R @@ -56,7 +56,7 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica #' use(pkg = "duplicates") #' #' charcount <- corpus("REUTERS2") %>% -#' nchars( +#' charcount( #' p_attribute = "word", #' char_regex = "[a-zA-Z]", #' lowercase = TRUE, @@ -77,6 +77,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica #' mc = parallel::detectCores() - 2L, #' vocab = vocab #' ) +#' +#' docgrps <- as_docgroups(dupl) setMethod("detect_duplicates", "partition_bundle", function( x, n = 5L, min_shingle_length = n, @@ -87,10 +89,10 @@ setMethod("detect_duplicates", "partition_bundle", ){ started <- Sys.time() - if (verbose) cli_progress_step("get sizes and dates") + if (verbose) cli_progress_step("get sizes and metadata") sizes <- sapply(x@objects, slot, "size") - dates <- s_attributes(x, s_attribute = s_attribute, unique = TRUE) - dates <- lapply(dates, `[[`, 1L) # a sanity measure + s_attr <- s_attributes(x, s_attribute = s_attribute, unique = TRUE) + s_attr <- lapply(s_attr, `[[`, 1L) # a sanity measure if (verbose) cli_progress_step("make ngram matrix") ngrams <- ngrams( @@ -113,10 +115,11 @@ setMethod("detect_duplicates", "partition_bundle", dt[, "duplicate_size" := sizes[dt[["duplicate_name"]]]] if (nrow(dt) > 0L){ - dt[, "date" := unlist(dates[dt[["name"]]])] - dt[, "date_duplicate" := unlist(dates[dt[["duplicate_name"]]])] + dt[, (s_attribute) := unlist(s_attr[dt[["name"]]])] + dt[, (paste("duplicate", s_attribute, sep = "_")) := unlist(s_attr[dt[["duplicate_name"]]])] } else { - dt[, "date" := 
character()][, "date_duplicate" := character()] + dt[, (s_attribute) := character()] + dt[, (paste("duplicate", s_attribute, sep = "_")) := character()] } dt } @@ -136,7 +139,9 @@ setMethod("detect_duplicates", "partition_bundle", #' chars <- chars[grep("[a-zA-Z]", names(chars))] #' char <- names(chars[order(chars, decreasing = FALSE)][1:20]) #' -#' detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6) +#' dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6) +#' +#' docgrps <- as_docgroups(dupl) #' @rdname detect_duplicates setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){ started <- Sys.time() @@ -203,4 +208,41 @@ setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, t duplicate_name = sim_min@Dimnames[[2]][sim_min@j + 1], similarity = sim_min@x ) -}) \ No newline at end of file +}) + + +#' Get groups of near-duplicate documents +#' +#' @param x A `data.table` with duplicates that have been detected. 
+#' @importFrom igraph graph_from_data_frame decompose get.vertex.attribute +#' @export as_docgroups +#' @rdname docgroups +as_docgroups <- function(x){ + + ids <- x[, c("name", "duplicate_name")] |> + as.data.frame() |> + igraph::graph_from_data_frame() |> + igraph::decompose() |> + lapply(igraph::get.vertex.attribute, name = "name") + + dt <- data.table( + name = unlist(ids), + group = unlist( + mapply(rep, seq_along(ids), sapply(ids, length), SIMPLIFY = FALSE), + recursive = FALSE + ) + ) + + duplcols <- grep("duplicate_", colnames(x), value = TRUE) + metadata <- unique(rbindlist( + list( + x[, setdiff(colnames(x), c(duplcols, "similarity")), with = FALSE], + x[, duplcols, with = FALSE] + ), + use.names = FALSE + )) + + y <- metadata[dt, on = "name"] + setcolorder(y, neworder = c("group", "name")) + y +} diff --git a/R/encode.R b/R/encode.R new file mode 100644 index 0000000..9e205b6 --- /dev/null +++ b/R/encode.R @@ -0,0 +1,118 @@ + +#' Encode annotation data +#' +#' Add structural attributes to CWB corpus based on the annotation data that +#' has been generated. +#' @param x Data. +#' @param corpus ID of CWB corpus. +#' @param method XXX. +#' @importFrom data.table setDT +#' @importFrom cwbtools s_attribute_encode +duplicates_encode <- function(x, corpus, method = "R"){ + + corpus_obj <- corpus(corpus) + + for (s_attr in c("is_duplicate", "duplicates")){ + s_attribute_encode( + values = as.character(x[[s_attr]]), + data_dir = corpus_obj@data_dir, + s_attribute = s_attr, + corpus = corpus, + region_matrix = as.matrix(x[, c("cpos_left", "cpos_right")]), + method = method, + registry_dir = corpus_obj@registry_dir, + encoding = corpus_obj@encoding, + delete = TRUE, + verbose = TRUE + ) + } + invisible(TRUE) +} + +.N <- NULL # to avoid warnings + +#' Make annotation data +#' +#' @description +#' Turn `data.table` with duplicates into file with corpus positions and +#' annotation of duplicates. 
+#' @param drop A character vector of document IDs that will be removed from +#' the annotation data. Useful for removing known noise that will be +#' excluded from the analysis otherwise. +#' @param cols XXX. +#' @param order XXX. +#' @param x Input `data.table`. +#' @param corpus ID of CWB corpus. +#' @param s_attribute Structural attribute to annotate. +#' @importFrom data.table setDT setnames setkeyv +#' @importFrom polmineR corpus +duplicates_as_annotation_data = function(x, corpus, s_attribute, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){ + + groups <- as_docgroups(x) + + if (!is.null(drop)){ + groups <- groups[!groups[["name"]] %in% drop] + groups_n <- groups[, .N, by = "group"] + groups[groups_n, "group_size" := groups_n[["N"]], on = "group"] + groups <- groups[groups[["group_size"]] > 1L][, "group_size" := NULL] + } + + original <- groups[, + setorderv(x = .SD, cols = cols, order = order)[1,], + by = "group", .SDcols = cols + ][, "is_duplicate" := FALSE] + groups[original, "is_duplicate" := groups[["is_duplicate"]], on = "name"] + groups[, "is_duplicate" := ifelse(is.na(groups[["is_duplicate"]]), TRUE, groups[["is_duplicate"]])] + duplicates_dt <- groups[, + list( + name = .SD[["name"]], + is_duplicate = .SD[["is_duplicate"]], + duplicates = sapply( + 1L:nrow(.SD), + function(i) paste(setdiff(.SD[["name"]], .SD[["name"]][i]), collapse = "|") + ) + ), + by = "group", .SDcols = c("name", "is_duplicate") + ][, "group" := NULL] + + # get regions ------------------------------------------------------------ + + corpus_obj <- corpus(corpus) + x <- corpus(corpus) + regions <- setDT( + RcppCWB::s_attribute_decode( + corpus = corpus, + data_dir = corpus_obj@data_dir, + s_attribute = s_attribute, + encoding = corpus_obj@encoding, + registry = corpus_obj@registry_dir, + method = "Rcpp" + ) + ) + setnames(regions, old = "value", new = s_attribute) + setkeyv(regions, s_attribute) + + # finalize annotation data ----------------------------------------------- 
+ + setnames(duplicates_dt, old = "name", new = s_attribute) + anno <- duplicates_dt[regions, on = s_attribute] + anno[, + "is_duplicate" := ifelse( + is.na(anno[["is_duplicate"]]), + FALSE, + anno[["is_duplicate"]] + ) + ] + anno[, + "duplicates" := ifelse( + is.na(anno[["duplicates"]]), + "", + anno[["duplicates"]] + )] + setcolorder( + anno, + c("cpos_left", "cpos_right", s_attribute, "is_duplicate", "duplicates") + ) + setorderv(anno, cols = "cpos_left") + anno +} diff --git a/R/utils.R b/R/utils.R index f0a7f61..3432b61 100644 --- a/R/utils.R +++ b/R/utils.R @@ -18,154 +18,3 @@ charfilter <- function(x, chars){ ) unlist(vocab, recursive = FALSE) } - -#' Get duplicate groups -#' -#' @param x A `data.table` with duplicates that have been detected. -#' @importFrom igraph graph_from_data_frame decompose get.vertex.attribute -duplicates_get_groups <- function(x){ - - ids <- x[, c("name", "duplicate_name")] |> - as.data.frame() |> - igraph::graph_from_data_frame() |> - igraph::decompose() |> - lapply(igraph::get.vertex.attribute, name = "name") - - dt <- data.table( - name = unlist(ids), - group = unlist( - mapply(rep, seq_along(ids), sapply(ids, length)), - recursive = FALSE - ) - ) - - metadata <- unique(rbindlist( - list( - x[, c("name", "size", "date")], - x[, c("duplicate_name", "duplicate_size", "date_duplicate")] - ), - use.names = FALSE - )) - - y <- metadata[dt, on = "name"] - setcolorder(y, neworder = c("group", "name", "date", "size")) - y -} - -#' Encode annotation data -#' -#' Add structural attributes to CWB corpus based on the annotation data that -#' has been generated. -#' @param x Data. -#' @param corpus ID of CWB corpus. -#' @param method XXX. 
-#' @importFrom data.table setDT -#' @importFrom cwbtools s_attribute_encode -duplicates_encode <- function(x, corpus, method = "R"){ - - corpus_obj <- corpus(corpus) - - for (s_attr in c("is_duplicate", "duplicates")){ - s_attribute_encode( - values = as.character(x[[s_attr]]), - data_dir = corpus_obj@data_dir, - s_attribute = s_attr, - corpus = corpus, - region_matrix = as.matrix(x[, c("cpos_left", "cpos_right")]), - method = method, - registry_dir = corpus_obj@registry_dir, - encoding = corpus_obj@encoding, - delete = TRUE, - verbose = TRUE - ) - } - invisible(TRUE) -} - -.N <- NULL # to avoid warnings - -#' Make annotation data -#' -#' @description -#' Turn `data.table` with duplicates into file with corpus positions and -#' annotation of duplicates. -#' @param drop A character vector of document IDs that will be removed from -#' the annotation data. Useful for removing known noise that will be -#' excluded from the analysis otherwise. -#' @param cols XXX. -#' @param order XXX. -#' @param x Input `data.table`. -#' @param corpus ID of CWB corpus. -#' @param s_attribute Structural attribute to annotate. 
-#' @importFrom data.table setDT setnames setkeyv -#' @importFrom polmineR corpus -duplicates_as_annotation_data = function(x, corpus, s_attribute, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){ - - groups <- duplicates_get_groups() - - if (!is.null(drop)){ - groups <- groups[!groups[["name"]] %in% drop] - groups_n <- groups[, .N, by = "group"] - groups[groups_n, "group_size" := groups_n[["N"]], on = "group"] - groups <- groups[groups[["group_size"]] > 1L][, "group_size" := NULL] - } - - original <- groups[, - setorderv(x = .SD, cols = cols, order = order)[1,], - by = "group", .SDcols = cols - ][, "is_duplicate" := FALSE] - groups[original, "is_duplicate" := groups[["is_duplicate"]], on = "name"] - groups[, "is_duplicate" := ifelse(is.na(groups[["is_duplicate"]]), TRUE, groups[["is_duplicate"]])] - duplicates_dt <- groups[, - list( - name = .SD[["name"]], - is_duplicate = .SD[["is_duplicate"]], - duplicates = sapply( - 1L:nrow(.SD), - function(i) paste(setdiff(.SD[["name"]], .SD[["name"]][i]), collapse = "|") - ) - ), - by = "group", .SDcols = c("name", "is_duplicate") - ][, "group" := NULL] - - # get regions ------------------------------------------------------------ - - corpus_obj <- corpus(corpus) - x <- corpus(corpus) - regions <- setDT( - RcppCWB::s_attribute_decode( - corpus = corpus, - data_dir = corpus_obj@data_dir, - s_attribute = s_attribute, - encoding = corpus_obj@encoding, - registry = corpus_obj@registry_dir, - method = "Rcpp" - ) - ) - setnames(regions, old = "value", new = s_attribute) - setkeyv(regions, s_attribute) - - # finalize annotation data ----------------------------------------------- - - setnames(duplicates_dt, old = "name", new = s_attribute) - anno <- duplicates_dt[regions, on = s_attribute] - anno[, - "is_duplicate" := ifelse( - is.na(anno[["is_duplicate"]]), - FALSE, - anno[["is_duplicate"]] - ) - ] - anno[, - "duplicates" := ifelse( - is.na(anno[["duplicates"]]), - "", - anno[["duplicates"]] - )] - setcolorder( - anno, - 
c("cpos_left", "cpos_right", s_attribute, "is_duplicate", "duplicates") - ) - setorderv(anno, cols = "cpos_left") - anno -} diff --git a/man/nchars.Rd b/man/charcount.Rd similarity index 64% rename from man/nchars.Rd rename to man/charcount.Rd index afa7d05..417205d 100644 --- a/man/nchars.Rd +++ b/man/charcount.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/nchars.R -\name{nchars} -\alias{nchars} -\alias{nchars,partition-method} -\alias{nchars,subcorpus-method} -\alias{nchars,partition_bundle-method} -\alias{nchars,subcorpus_bundle-method} -\alias{nchars,corpus-method} +% Please edit documentation in R/charcount.R +\name{charcount} +\alias{charcount} +\alias{charcount,partition-method} +\alias{charcount,subcorpus-method} +\alias{charcount,partition_bundle-method} +\alias{charcount,subcorpus_bundle-method} +\alias{charcount,corpus-method} \title{Count the number of characters} \usage{ -nchars(x, ...) +charcount(x, ...) -\S4method{nchars}{partition}( +\S4method{charcount}{partition}( x, p_attribute = "word", char_regex = "[a-zA-Z]", @@ -19,7 +19,7 @@ nchars(x, ...) decreasing = TRUE ) -\S4method{nchars}{subcorpus}( +\S4method{charcount}{subcorpus}( x, p_attribute = "word", char_regex = "[a-zA-Z]", @@ -27,11 +27,11 @@ nchars(x, ...) decreasing = TRUE ) -\S4method{nchars}{partition_bundle}(x, mc = FALSE, progress = TRUE, decreasing = TRUE, ...) +\S4method{charcount}{partition_bundle}(x, mc = FALSE, progress = TRUE, decreasing = TRUE, ...) -\S4method{nchars}{subcorpus_bundle}(x, decreasing = TRUE, mc = FALSE, progress = TRUE, ...) +\S4method{charcount}{subcorpus_bundle}(x, decreasing = TRUE, mc = FALSE, progress = TRUE, ...) 
-\S4method{nchars}{corpus}( +\S4method{charcount}{corpus}( x, p_attribute = "word", lowercase = TRUE, @@ -67,20 +67,20 @@ library(polmineR) use("RcppCWB") partition("REUTERS", id = "127") \%>\% - nchars() + charcount() corpus("REUTERS") \%>\% subset(id == "127") \%>\% - nchars() + charcount() corpus("REUTERS") \%>\% partition_bundle(s_attribute = "id") \%>\% - nchars() + charcount() corpus("REUTERS") \%>\% split(s_attribute = "id") \%>\% - nchars() + charcount() library(polmineR) use("RcppCWB") -n <- corpus("REUTERS") \%>\% nchars(decreasing = FALSE) +n <- corpus("REUTERS") \%>\% charcount(decreasing = FALSE) } diff --git a/man/detect_duplicates.Rd b/man/detect_duplicates.Rd index 00f0d26..73fe128 100644 --- a/man/detect_duplicates.Rd +++ b/man/detect_duplicates.Rd @@ -88,7 +88,7 @@ library(polmineR) use(pkg = "duplicates") charcount <- corpus("REUTERS2") \%>\% - nchars( + charcount( p_attribute = "word", char_regex = "[a-zA-Z]", lowercase = TRUE, @@ -109,6 +109,8 @@ dupl <- detect_duplicates( mc = parallel::detectCores() - 2L, vocab = vocab ) + +docgrps <- as_docgroups(dupl) library(polmineR) use(pkg = "duplicates") @@ -120,5 +122,7 @@ chars <- table(tolower(strsplit(paste(unlist(x), collapse = ""), "")[[1]])) chars <- chars[grep("[a-zA-Z]", names(chars))] char <- names(chars[order(chars, decreasing = FALSE)][1:20]) -detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6) +dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6) + +docgrps <- as_docgroups(dupl) } diff --git a/man/docgroups.Rd b/man/docgroups.Rd new file mode 100644 index 0000000..3f921d9 --- /dev/null +++ b/man/docgroups.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/detect_duplicates.R +\name{as_docgroups} +\alias{as_docgroups} +\title{Get groups of near-duplicate documents} +\usage{ +as_docgroups(x) +} +\arguments{ +\item{x}{A \code{data.table} with duplicates that have been detected.} +} +\description{ +Get groups of 
near-duplicate documents +} diff --git a/man/duplicates_as_annotation_data.Rd b/man/duplicates_as_annotation_data.Rd index 083171c..8734014 100644 --- a/man/duplicates_as_annotation_data.Rd +++ b/man/duplicates_as_annotation_data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/encode.R \name{duplicates_as_annotation_data} \alias{duplicates_as_annotation_data} \title{Make annotation data} diff --git a/man/duplicates_encode.Rd b/man/duplicates_encode.Rd index 5e04d51..64a0d1d 100644 --- a/man/duplicates_encode.Rd +++ b/man/duplicates_encode.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/encode.R \name{duplicates_encode} \alias{duplicates_encode} \title{Encode annotation data} diff --git a/man/duplicates_get_groups.Rd b/man/duplicates_get_groups.Rd deleted file mode 100644 index 6abf07b..0000000 --- a/man/duplicates_get_groups.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{duplicates_get_groups} -\alias{duplicates_get_groups} -\title{Get duplicate groups} -\usage{ -duplicates_get_groups(x) -} -\arguments{ -\item{x}{A \code{data.table} with duplicates that have been detected.} -} -\description{ -Get duplicate groups -} diff --git a/tests/testthat/test_detect_duplicates.R b/tests/testthat/test_detect_duplicates.R index 031a359..43fa004 100644 --- a/tests/testthat/test_detect_duplicates.R +++ b/tests/testthat/test_detect_duplicates.R @@ -8,7 +8,7 @@ test_that( "run duplicate detection", { charcount <- corpus("REUTERS2") %>% - nchars( + charcount( p_attribute = "word", char_regex = "[a-zA-Z]", lowercase = TRUE, @@ -55,5 +55,13 @@ test_that( dupl[, c("name", "duplicate_name", "similarity")], dupl2 ) + + # -------------------------------------------------------------------------- + + docgroups1 <- 
as_docgroups(dupl) + docgroups2 <- as_docgroups(dupl2) + + expect_identical(docgroups1[["group"]], docgroups2[["group"]]) + expect_identical(docgroups1[["name"]], docgroups2[["name"]]) } ) diff --git a/tests/testthat/test_nchars.R b/tests/testthat/test_nchars.R index 84d5457..e7444c7 100644 --- a/tests/testthat/test_nchars.R +++ b/tests/testthat/test_nchars.R @@ -5,10 +5,10 @@ use("duplicates") testthat::context("detect_duplicates") test_that( - "crosscheck nchars", + "crosscheck charcount", { charcount1 <- corpus("REUTERS2") %>% - nchars( + charcount( p_attribute = "word", char_regex = "[a-zA-Z]", lowercase = TRUE, diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd index 2d762a5..44686aa 100644 --- a/vignettes/vignette.Rmd +++ b/vignettes/vignette.Rmd @@ -23,7 +23,7 @@ library(duplicates) use(pkg = "duplicates") charcount <- corpus("REUTERS2") %>% - nchars( + charcount( p_attribute = "word", char_regex = "[a-zA-Z]", lowercase = TRUE,