From b54f663f2671796bde866693c5c549b9b11403ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 9 Jul 2023 22:03:21 +0200 Subject: [PATCH] as_docgroups() renamed to docgroups() --- DESCRIPTION | 8 +++--- NAMESPACE | 6 ++-- NEWS.md | 5 ++++ R/{encode.R => annodata.R} | 4 +-- R/{detect_duplicates.R => docsimil.R} | 34 +++++++++++------------ man/docgroups.Rd | 8 +++--- man/{detect_duplicates.Rd => docsimil.Rd} | 28 +++++++++---------- man/duplicates_as_annotation_data.Rd | 6 ++-- tests/testthat/test_detect_duplicates.R | 10 +++---- tests/testthat/test_nchars.R | 2 +- vignettes/vignette.Rmd | 6 ++-- 11 files changed, 61 insertions(+), 56 deletions(-) rename R/{encode.R => annodata.R} (97%) rename R/{detect_duplicates.R => docsimil.R} (90%) rename man/{detect_duplicates.Rd => docsimil.Rd} (82%) diff --git a/DESCRIPTION b/DESCRIPTION index 809d65a..da5ba95 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: duplicates Type: Package Title: Near duplicate detection -Version: 0.1.3 -Date: 2023-07-03 +Version: 0.1.4 +Date: 2023-07-09 Author: Andreas Blaette Maintainer: Andreas Blaette Additional_repositories: https://polmine.github.io/drat @@ -32,8 +32,8 @@ License: GPL-3 Collate: 'duplicates_package.R' 'charcount.R' - 'detect_duplicates.R' + 'docsimil.R' 'utils.R' - 'encode.R' + 'annodata.R' RoxygenNote: 7.2.3 Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index c5fd21e..b061379 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,11 @@ # Generated by roxygen2: do not edit by hand -export(as_docgroups) export(charfilter) -export(detect_duplicates) +export(docgroups) +export(docsimil) export(duplicates_as_annotation_data) exportMethods(charcount) -exportMethods(detect_duplicates) +exportMethods(docsimil) import(data.table) importFrom(Matrix,triu) importFrom(R6,R6Class) diff --git a/NEWS.md b/NEWS.md index 41f3df0..fa57e7e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +## v0.1.4 + +- Method `detect_duplicates()` renamed to `docsimil()`. +- Function `as_docgroups()` renamed to `docgroups()`. + ## v0.1.3 - Function `minimize_vocabulary()` more generic and renamed as `charfilter()`. diff --git a/R/encode.R b/R/annodata.R similarity index 97% rename from R/encode.R rename to R/annodata.R index 73ce82a..bef0c5f 100644 --- a/R/encode.R +++ b/R/annodata.R @@ -30,7 +30,7 @@ #' x <- corpus("REUTERS2") |> #' split(s_attribute = "doc_id") #' -#' dupl <- detect_duplicates( +#' dupl <- docsimil( #' x = x, #' p_attribute = "word", #' s_attribute = "doc_id", @@ -38,7 +38,7 @@ #' vocab = vocab #' ) #' -#' grps <- as_docgroups(dupl) +#' grps <- docgroups(dupl) #' #' annodata <- duplicates_as_annotation_data( #' x = grps, diff --git a/R/detect_duplicates.R b/R/docsimil.R similarity index 90% rename from R/detect_duplicates.R rename to R/docsimil.R index 4308330..8131af6 100644 --- a/R/detect_duplicates.R +++ b/R/docsimil.R @@ -1,5 +1,5 @@ -#' @rdname detect_duplicates -setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplicates")) +#' @rdname docsimil +setGeneric("docsimil", function(x, ...) standardGeneric("docsimil")) #' Detect Duplicates @@ -34,8 +34,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica #' @param verbose A `logical` value, whether to be verbose. #' @param ... Further arguments (unused). #' @param vocab Pruned vocabulary. -#' @export detect_duplicates -#' @rdname detect_duplicates +#' @export docsimil +#' @rdname docsimil #' @importFrom parallel mclapply #' @importFrom pbapply pblapply #' @importFrom stats setNames @@ -70,7 +70,7 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica #' x <- corpus("REUTERS2") |> #' split(s_attribute = "doc_id") #' -#' dupl <- detect_duplicates( +#' dupl <- docsimil( #' x = x, #' p_attribute = "word", #' s_attribute = "doc_id", @@ -78,8 +78,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica #' vocab = vocab #' ) #' -#' docgrps <- as_docgroups(dupl) -setMethod("detect_duplicates", "partition_bundle", +#' docgrps <- docgroups(dupl) +setMethod("docsimil", "partition_bundle", function( x, n = 5L, min_shingle_length = n, p_attribute = "word", s_attribute = "text_date", @@ -103,7 +103,7 @@ setMethod("detect_duplicates", "partition_bundle", weigh(method = "tfidf") |> as.sparseMatrix() - dt <- detect_duplicates( + dt <- docsimil( x = m, n = n, min_shingle_length = min_shingle_length, @@ -139,11 +139,11 @@ setMethod("detect_duplicates", "partition_bundle", #' chars <- chars[grep("[a-zA-Z]", names(chars))] #' char <- names(chars[order(chars, decreasing = FALSE)][1:20]) #' -#' dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6) +#' dupl <- docsimil(x = x, n = 5L, char = char, threshold = 0.6) #' -#' docgrps <- as_docgroups(dupl, cols = "name", order = 1L) -#' @rdname detect_duplicates -setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){ +#' docgrps <- docgroups(dupl, cols = "name", order = 1L) +#' @rdname docsimil +setMethod("docsimil", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){ started <- Sys.time() stopifnot(is.character(char)) @@ -173,7 +173,7 @@ setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = m <- weigh(tdm, method = "tfidf") |> as.sparseMatrix() - detect_duplicates( + docsimil( x = m, n = n, min_shingle_length = min_shingle_length, @@ -183,9 +183,9 @@ setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = } ) -#' @rdname detect_duplicates +#' @rdname docsimil #' @export -setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){ +setMethod("docsimil", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){ # Very short documents may result in shingle lengths below n, and this # may result in an undesired complete similarity. So drop short @@ -220,9 +220,9 @@ setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, t #' @param cols XXX. #' @param order XXX. #' @importFrom igraph graph_from_data_frame decompose get.vertex.attribute -#' @export as_docgroups +#' @export docgroups #' @rdname docgroups -as_docgroups <- function(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){ +docgroups <- function(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){ ids <- x[, c("name", "duplicate_name")] |> as.data.frame() |> diff --git a/man/docgroups.Rd b/man/docgroups.Rd index e4cba1a..57b200c 100644 --- a/man/docgroups.Rd +++ b/man/docgroups.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/detect_duplicates.R -\name{as_docgroups} -\alias{as_docgroups} +% Please edit documentation in R/docsimil.R +\name{docgroups} +\alias{docgroups} \title{Get groups of near-duplicate documents} \usage{ -as_docgroups(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)) +docgroups(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)) } \arguments{ \item{x}{A \code{data.table} with duplicates that have been detected.} diff --git a/man/detect_duplicates.Rd b/man/docsimil.Rd similarity index 82% rename from man/detect_duplicates.Rd rename to man/docsimil.Rd index 10a4105..735041a 100644 --- a/man/detect_duplicates.Rd +++ b/man/docsimil.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/detect_duplicates.R -\name{detect_duplicates} -\alias{detect_duplicates} -\alias{detect_duplicates,partition_bundle-method} -\alias{detect_duplicates,list-method} -\alias{detect_duplicates,dgCMatrix-method} +% Please edit documentation in R/docsimil.R +\name{docsimil} +\alias{docsimil} +\alias{docsimil,partition_bundle-method} +\alias{docsimil,list-method} +\alias{docsimil,dgCMatrix-method} \title{Detect Duplicates} \usage{ -detect_duplicates(x, ...) +docsimil(x, ...) -\S4method{detect_duplicates}{partition_bundle}( +\S4method{docsimil}{partition_bundle}( x, n = 5L, min_shingle_length = n, @@ -21,7 +21,7 @@ detect_duplicates(x, ...) mc = FALSE ) -\S4method{detect_duplicates}{list}( +\S4method{docsimil}{list}( x, n = 5L, min_shingle_length = n, @@ -31,7 +31,7 @@ detect_duplicates(x, ...) mc = FALSE ) -\S4method{detect_duplicates}{dgCMatrix}(x, n, min_shingle_length, threshold, verbose) +\S4method{docsimil}{dgCMatrix}(x, n, min_shingle_length, threshold, verbose) } \arguments{ \item{x}{A \code{partition_bundle} or \code{subcorpus_bundle} object with documents to @@ -102,7 +102,7 @@ vocab <- corpus("REUTERS2") \%>\% x <- corpus("REUTERS2") |> split(s_attribute = "doc_id") -dupl <- detect_duplicates( +dupl <- docsimil( x = x, p_attribute = "word", s_attribute = "doc_id", @@ -110,7 +110,7 @@ dupl <- detect_duplicates( vocab = vocab ) -docgrps <- as_docgroups(dupl) +docgrps <- docgroups(dupl) library(polmineR) use(pkg = "duplicates") @@ -122,7 +122,7 @@ chars <- table(tolower(strsplit(paste(unlist(x), collapse = ""), "")[[1]])) chars <- chars[grep("[a-zA-Z]", names(chars))] char <- names(chars[order(chars, decreasing = FALSE)][1:20]) -dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6) +dupl <- docsimil(x = x, n = 5L, char = char, threshold = 0.6) -docgrps <- as_docgroups(dupl, cols = "name", order = 1L) +docgrps <- docgroups(dupl, cols = "name", order = 1L) } diff --git a/man/duplicates_as_annotation_data.Rd b/man/duplicates_as_annotation_data.Rd index cacfc5f..f498455 100644 --- a/man/duplicates_as_annotation_data.Rd +++ b/man/duplicates_as_annotation_data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/encode.R +% Please edit documentation in R/annodata.R \name{duplicates_as_annotation_data} \alias{duplicates_as_annotation_data} \title{Make annotation data} @@ -36,7 +36,7 @@ vocab <- corpus("REUTERS2") \%>\% x <- corpus("REUTERS2") |> split(s_attribute = "doc_id") -dupl <- detect_duplicates( +dupl <- docsimil( x = x, p_attribute = "word", s_attribute = "doc_id", @@ -44,7 +44,7 @@ dupl <- detect_duplicates( vocab = vocab ) -grps <- as_docgroups(dupl) +grps <- docgroups(dupl) annodata <- duplicates_as_annotation_data( x = grps, diff --git a/tests/testthat/test_detect_duplicates.R b/tests/testthat/test_detect_duplicates.R index 47d37ad..4742df2 100644 --- a/tests/testthat/test_detect_duplicates.R +++ b/tests/testthat/test_detect_duplicates.R @@ -2,7 +2,7 @@ library(polmineR) library(duplicates) use("duplicates") -testthat::context("detect_duplicates") +testthat::context("docsimil") test_that( "run duplicate detection", @@ -24,7 +24,7 @@ test_that( x <- corpus("REUTERS2") |> polmineR::split(s_attribute = "doc_id") - dupl <- detect_duplicates( + dupl <- docsimil( x = x, p_attribute = "word", s_attribute = "doc_id", @@ -41,7 +41,7 @@ test_that( polmineR::split(s_attribute = "doc_id") %>% get_token_stream(p_attribute = "word", collapse = "") - dupl2 <- detect_duplicates( + dupl2 <- docsimil( x = x, n = 5L, char = names(charcount[1:20]), @@ -58,8 +58,8 @@ test_that( # -------------------------------------------------------------------------- - docgroups1 <- as_docgroups(dupl) - docgroups2 <- as_docgroups(dupl2, cols = "name", order = 1L) + docgroups1 <- docgroups(dupl) + docgroups2 <- docgroups(dupl2, cols = "name", order = 1L) expect_identical(docgroups1[["group"]], docgroups2[["group"]]) expect_identical(docgroups1[["name"]], docgroups2[["name"]]) diff --git a/tests/testthat/test_nchars.R b/tests/testthat/test_nchars.R index e7444c7..396be5a 100644 --- a/tests/testthat/test_nchars.R +++ b/tests/testthat/test_nchars.R @@ -2,7 +2,7 @@ library(polmineR) library(duplicates) use("duplicates") -testthat::context("detect_duplicates") +testthat::context("docsimil") test_that( "crosscheck charcount", diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd index a432d93..6e51057 100644 --- a/vignettes/vignette.Rmd +++ b/vignettes/vignette.Rmd @@ -37,7 +37,7 @@ vocab <- corpus("REUTERS2") %>% x <- corpus("REUTERS2") |> split(s_attribute = "doc_id") -dupl <- detect_duplicates( +dupl <- docsimil( x = x, p_attribute = "word", s_attribute = "doc_id", @@ -54,7 +54,7 @@ dupl <- detect_duplicates( x <- corpus("REUTERS2") |> split(s_attribute = "doc_id") -dupl <- detect_duplicates( +dupl <- docsimil( x = x, p_attribute = "word", s_attribute = "doc_id", @@ -67,7 +67,7 @@ dupl <- detect_duplicates( ## Write to corpus ```{r} -groups <- as_docgroups(dupl) +groups <- docgroups(dupl) annodata <- duplicates_as_annotation_data( x = groups,