From b54f663f2671796bde866693c5c549b9b11403ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?=
 <andreasblatte@MBP-von-Andreas.fritz.box>
Date: Sun, 9 Jul 2023 22:03:21 +0200
Subject: [PATCH] detect_duplicates() renamed to docsimil(), as_docgroups() to docgroups()

---
 DESCRIPTION                               |  8 +++---
 NAMESPACE                                 |  6 ++--
 NEWS.md                                   |  5 ++++
 R/{encode.R => annodata.R}                |  4 +--
 R/{detect_duplicates.R => docsimil.R}     | 34 +++++++++++------------
 man/docgroups.Rd                          |  8 +++---
 man/{detect_duplicates.Rd => docsimil.Rd} | 28 +++++++++----------
 man/duplicates_as_annotation_data.Rd      |  6 ++--
 tests/testthat/test_detect_duplicates.R   | 10 +++----
 tests/testthat/test_nchars.R              |  2 +-
 vignettes/vignette.Rmd                    |  6 ++--
 11 files changed, 61 insertions(+), 56 deletions(-)
 rename R/{encode.R => annodata.R} (97%)
 rename R/{detect_duplicates.R => docsimil.R} (90%)
 rename man/{detect_duplicates.Rd => docsimil.Rd} (82%)

diff --git a/DESCRIPTION b/DESCRIPTION
index 809d65a..da5ba95 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: duplicates
 Type: Package
 Title: Near duplicate detection
-Version: 0.1.3
-Date: 2023-07-03
+Version: 0.1.4
+Date: 2023-07-09
 Author: Andreas Blaette
 Maintainer: Andreas Blaette <andreas.blaette@uni-due.de>
 Additional_repositories: https://polmine.github.io/drat
@@ -32,8 +32,8 @@ License: GPL-3
 Collate:
     'duplicates_package.R'
     'charcount.R'
-    'detect_duplicates.R'
+    'docsimil.R'
     'utils.R'
-    'encode.R'
+    'annodata.R'
 RoxygenNote: 7.2.3
 Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
index c5fd21e..b061379 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,11 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
-export(as_docgroups)
 export(charfilter)
-export(detect_duplicates)
+export(docgroups)
+export(docsimil)
 export(duplicates_as_annotation_data)
 exportMethods(charcount)
-exportMethods(detect_duplicates)
+exportMethods(docsimil)
 import(data.table)
 importFrom(Matrix,triu)
 importFrom(R6,R6Class)
diff --git a/NEWS.md b/NEWS.md
index 41f3df0..fa57e7e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,8 @@
+## v0.1.4
+
+- Method `detect_duplicates()` renamed to `docsimil()`.
+- Function `as_docgroups()` renamed to `docgroups()`.
+
 ## v0.1.3
 
 - Function `minimize_vocabulary()` more generic and renamed as `charfilter()`.
diff --git a/R/encode.R b/R/annodata.R
similarity index 97%
rename from R/encode.R
rename to R/annodata.R
index 73ce82a..bef0c5f 100644
--- a/R/encode.R
+++ b/R/annodata.R
@@ -30,7 +30,7 @@
 #' x <- corpus("REUTERS2") |>
 #'   split(s_attribute = "doc_id")
 #' 
-#' dupl <- detect_duplicates(
+#' dupl <- docsimil(
 #'     x = x,
 #'     p_attribute = "word",
 #'     s_attribute = "doc_id",
@@ -38,7 +38,7 @@
 #'     vocab = vocab
 #'   )
 #' 
-#' grps <- as_docgroups(dupl)
+#' grps <- docgroups(dupl)
 #' 
 #' annodata <- duplicates_as_annotation_data(
 #'   x = grps,
diff --git a/R/detect_duplicates.R b/R/docsimil.R
similarity index 90%
rename from R/detect_duplicates.R
rename to R/docsimil.R
index 4308330..8131af6 100644
--- a/R/detect_duplicates.R
+++ b/R/docsimil.R
@@ -1,5 +1,5 @@
-#' @rdname detect_duplicates
-setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplicates"))
+#' @rdname docsimil
+setGeneric("docsimil", function(x, ...) standardGeneric("docsimil"))
 
 
 #' Detect Duplicates
@@ -34,8 +34,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica
 #' @param verbose A `logical` value, whether to be verbose.
 #' @param ... Further arguments (unused).
 #' @param vocab Pruned vocabulary.
-#' @export detect_duplicates
-#' @rdname detect_duplicates
+#' @export docsimil
+#' @rdname docsimil
 #' @importFrom parallel mclapply
 #' @importFrom pbapply pblapply
 #' @importFrom stats setNames
@@ -70,7 +70,7 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica
 #' x <- corpus("REUTERS2") |>
 #'   split(s_attribute = "doc_id")
 #' 
-#' dupl <- detect_duplicates(
+#' dupl <- docsimil(
 #'     x = x,
 #'     p_attribute = "word",
 #'     s_attribute = "doc_id",
@@ -78,8 +78,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica
 #'     vocab = vocab
 #'   )
 #'   
-#' docgrps <- as_docgroups(dupl)
-setMethod("detect_duplicates", "partition_bundle",
+#' docgrps <- docgroups(dupl)
+setMethod("docsimil", "partition_bundle",
   function(
     x, n = 5L, min_shingle_length = n,
     p_attribute = "word", s_attribute = "text_date",
@@ -103,7 +103,7 @@ setMethod("detect_duplicates", "partition_bundle",
       weigh(method = "tfidf") |>
       as.sparseMatrix()
     
-    dt <- detect_duplicates(
+    dt <- docsimil(
       x = m,
       n = n,
       min_shingle_length = min_shingle_length,
@@ -139,11 +139,11 @@ setMethod("detect_duplicates", "partition_bundle",
 #' chars <- chars[grep("[a-zA-Z]", names(chars))]
 #' char <- names(chars[order(chars, decreasing = FALSE)][1:20])
 #' 
-#' dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6)
+#' dupl <- docsimil(x = x, n = 5L, char = char, threshold = 0.6)
 #' 
-#' docgrps <- as_docgroups(dupl, cols = "name", order = 1L)
-#' @rdname detect_duplicates
-setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){ 
+#' docgrps <- docgroups(dupl, cols = "name", order = 1L)
+#' @rdname docsimil
+setMethod("docsimil", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){ 
   started <- Sys.time()
   
   stopifnot(is.character(char))
@@ -173,7 +173,7 @@ setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length =
   
   m <- weigh(tdm, method = "tfidf") |> as.sparseMatrix()
   
-  detect_duplicates(
+  docsimil(
     x = m,
     n = n,
     min_shingle_length = min_shingle_length,
@@ -183,9 +183,9 @@ setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length =
 }
 )
 
-#' @rdname detect_duplicates
+#' @rdname docsimil
 #' @export
-setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){
+setMethod("docsimil", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){
   
   # Very short documents may result in shingle lengths below n, and this
   # may result in an undesired complete similarity. So drop short 
@@ -220,9 +220,9 @@ setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, t
 #' @param cols XXX.
 #' @param order XXX.
 #' @importFrom igraph graph_from_data_frame decompose get.vertex.attribute
-#' @export as_docgroups
+#' @export docgroups
 #' @rdname docgroups
-as_docgroups <- function(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){
+docgroups <- function(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){
   
   ids <- x[, c("name", "duplicate_name")] |>
     as.data.frame() |>
diff --git a/man/docgroups.Rd b/man/docgroups.Rd
index e4cba1a..57b200c 100644
--- a/man/docgroups.Rd
+++ b/man/docgroups.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/detect_duplicates.R
-\name{as_docgroups}
-\alias{as_docgroups}
+% Please edit documentation in R/docsimil.R
+\name{docgroups}
+\alias{docgroups}
 \title{Get groups of near-duplicate documents}
 \usage{
-as_docgroups(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L))
+docgroups(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L))
 }
 \arguments{
 \item{x}{A \code{data.table} with duplicates that have been detected.}
diff --git a/man/detect_duplicates.Rd b/man/docsimil.Rd
similarity index 82%
rename from man/detect_duplicates.Rd
rename to man/docsimil.Rd
index 10a4105..735041a 100644
--- a/man/detect_duplicates.Rd
+++ b/man/docsimil.Rd
@@ -1,15 +1,15 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/detect_duplicates.R
-\name{detect_duplicates}
-\alias{detect_duplicates}
-\alias{detect_duplicates,partition_bundle-method}
-\alias{detect_duplicates,list-method}
-\alias{detect_duplicates,dgCMatrix-method}
+% Please edit documentation in R/docsimil.R
+\name{docsimil}
+\alias{docsimil}
+\alias{docsimil,partition_bundle-method}
+\alias{docsimil,list-method}
+\alias{docsimil,dgCMatrix-method}
 \title{Detect Duplicates}
 \usage{
-detect_duplicates(x, ...)
+docsimil(x, ...)
 
-\S4method{detect_duplicates}{partition_bundle}(
+\S4method{docsimil}{partition_bundle}(
   x,
   n = 5L,
   min_shingle_length = n,
@@ -21,7 +21,7 @@ detect_duplicates(x, ...)
   mc = FALSE
 )
 
-\S4method{detect_duplicates}{list}(
+\S4method{docsimil}{list}(
   x,
   n = 5L,
   min_shingle_length = n,
@@ -31,7 +31,7 @@ detect_duplicates(x, ...)
   mc = FALSE
 )
 
-\S4method{detect_duplicates}{dgCMatrix}(x, n, min_shingle_length, threshold, verbose)
+\S4method{docsimil}{dgCMatrix}(x, n, min_shingle_length, threshold, verbose)
 }
 \arguments{
 \item{x}{A \code{partition_bundle} or \code{subcorpus_bundle} object with documents to
@@ -102,7 +102,7 @@ vocab <- corpus("REUTERS2") \%>\%
 x <- corpus("REUTERS2") |>
   split(s_attribute = "doc_id")
 
-dupl <- detect_duplicates(
+dupl <- docsimil(
     x = x,
     p_attribute = "word",
     s_attribute = "doc_id",
@@ -110,7 +110,7 @@ dupl <- detect_duplicates(
     vocab = vocab
   )
   
-docgrps <- as_docgroups(dupl)
+docgrps <- docgroups(dupl)
 library(polmineR)
 use(pkg = "duplicates")
 
@@ -122,7 +122,7 @@ chars <- table(tolower(strsplit(paste(unlist(x), collapse = ""), "")[[1]]))
 chars <- chars[grep("[a-zA-Z]", names(chars))]
 char <- names(chars[order(chars, decreasing = FALSE)][1:20])
 
-dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6)
+dupl <- docsimil(x = x, n = 5L, char = char, threshold = 0.6)
 
-docgrps <- as_docgroups(dupl, cols = "name", order = 1L)
+docgrps <- docgroups(dupl, cols = "name", order = 1L)
 }
diff --git a/man/duplicates_as_annotation_data.Rd b/man/duplicates_as_annotation_data.Rd
index cacfc5f..f498455 100644
--- a/man/duplicates_as_annotation_data.Rd
+++ b/man/duplicates_as_annotation_data.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/encode.R
+% Please edit documentation in R/annodata.R
 \name{duplicates_as_annotation_data}
 \alias{duplicates_as_annotation_data}
 \title{Make annotation data}
@@ -36,7 +36,7 @@ vocab <- corpus("REUTERS2") \%>\%
 x <- corpus("REUTERS2") |>
   split(s_attribute = "doc_id")
 
-dupl <- detect_duplicates(
+dupl <- docsimil(
     x = x,
     p_attribute = "word",
     s_attribute = "doc_id",
@@ -44,7 +44,7 @@ dupl <- detect_duplicates(
     vocab = vocab
   )
 
-grps <- as_docgroups(dupl)
+grps <- docgroups(dupl)
 
 annodata <- duplicates_as_annotation_data(
   x = grps,
diff --git a/tests/testthat/test_detect_duplicates.R b/tests/testthat/test_detect_duplicates.R
index 47d37ad..4742df2 100644
--- a/tests/testthat/test_detect_duplicates.R
+++ b/tests/testthat/test_detect_duplicates.R
@@ -2,7 +2,7 @@ library(polmineR)
 library(duplicates)
 use("duplicates")
 
-testthat::context("detect_duplicates")
+testthat::context("docsimil")
 
 test_that(
   "run duplicate detection",
@@ -24,7 +24,7 @@ test_that(
     x <- corpus("REUTERS2") |>
       polmineR::split(s_attribute = "doc_id")
 
-    dupl <- detect_duplicates(
+    dupl <- docsimil(
         x = x,
         p_attribute = "word",
         s_attribute = "doc_id",
@@ -41,7 +41,7 @@ test_that(
       polmineR::split(s_attribute = "doc_id") %>%
       get_token_stream(p_attribute = "word", collapse = "")
 
-    dupl2 <- detect_duplicates(
+    dupl2 <- docsimil(
       x = x,
       n = 5L,
       char = names(charcount[1:20]),
@@ -58,8 +58,8 @@ test_that(
     
     # --------------------------------------------------------------------------
     
-    docgroups1 <- as_docgroups(dupl)
-    docgroups2 <- as_docgroups(dupl2, cols = "name", order = 1L)
+    docgroups1 <- docgroups(dupl)
+    docgroups2 <- docgroups(dupl2, cols = "name", order = 1L)
     
     expect_identical(docgroups1[["group"]], docgroups2[["group"]])
     expect_identical(docgroups1[["name"]], docgroups2[["name"]])
diff --git a/tests/testthat/test_nchars.R b/tests/testthat/test_nchars.R
index e7444c7..396be5a 100644
--- a/tests/testthat/test_nchars.R
+++ b/tests/testthat/test_nchars.R
@@ -2,7 +2,7 @@ library(polmineR)
 library(duplicates)
 use("duplicates")
 
-testthat::context("detect_duplicates")
+testthat::context("docsimil")
 
 test_that(
   "crosscheck charcount",
diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd
index a432d93..6e51057 100644
--- a/vignettes/vignette.Rmd
+++ b/vignettes/vignette.Rmd
@@ -37,7 +37,7 @@ vocab <- corpus("REUTERS2") %>%
 x <- corpus("REUTERS2") |>
   split(s_attribute = "doc_id")
 
-dupl <- detect_duplicates(
+dupl <- docsimil(
     x = x,
     p_attribute = "word",
     s_attribute = "doc_id",
@@ -54,7 +54,7 @@ dupl <- detect_duplicates(
 x <- corpus("REUTERS2") |>
   split(s_attribute = "doc_id")
 
-dupl <- detect_duplicates(
+dupl <- docsimil(
     x = x,
     p_attribute = "word",
     s_attribute = "doc_id",
@@ -67,7 +67,7 @@ dupl <- detect_duplicates(
 ## Write to corpus
 
 ```{r}
-groups <- as_docgroups(dupl)
+groups <- docgroups(dupl)
 
 annodata <- duplicates_as_annotation_data(
   x = groups,