Prep for CRAN release

RoheLab · Aug 31, 2023 · bc15efe · bc15efe
1 parent da6a935
commit bc15efe
Show file tree

Hide file tree

Showing 13 changed files with 317 additions and 174 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -11,8 +11,8 @@ Authors@R: c(
 Description: Cross-validated eigenvalues are estimated by
     splitting a graph into two parts, the training and the test graph.
     The training graph is use to estimate eigenvectors, and
-    the test graph is use to evaluate the correlation between the sample
-    eigenvectors and the eigenspace of the test graph.
+    the test graph is use to evaluate the correlation between the training
+    eigenvectors and the eigenvectors of the test graph.
     The correlations follow a simple central limit theorem that can
     be used to estimate graph dimension via hypothesis testing.
 License: GPL (>= 3)

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,23 +2,7 @@
 
 S3method(plot,eigcv)
 S3method(print,eigcv)
+export("%>%")
 export(eigcv)
 import(Matrix)
-importFrom(dplyr,group_by)
-importFrom(dplyr,mutate)
-importFrom(dplyr,select)
-importFrom(dplyr,summarise)
-importFrom(dplyr,summarize)
-importFrom(dplyr,ungroup)
-importFrom(ggplot2,aes)
-importFrom(ggplot2,geom_hline)
-importFrom(ggplot2,geom_line)
-importFrom(ggplot2,geom_point)
-importFrom(ggplot2,ggplot)
-importFrom(ggplot2,labs)
-importFrom(ggplot2,scale_color_manual)
-importFrom(ggplot2,scale_x_continuous)
-importFrom(ggplot2,theme)
-importFrom(ggplot2,theme_bw)
 importFrom(magrittr,"%>%")
-importFrom(rlang,.data)
diff --git a/R/eigcv.R b/R/eigcv.R
@@ -1,9 +1,10 @@
+
+# assumes elements of A are non-negative integers (and Poisson)
 split_graph <- function(A, test_portion = 0.1) {
 
   A <- methods::as(A, "dMatrix")
   A <- methods::as(A, "TsparseMatrix")
 
-  # assumes elements of A are Poisson (i.e. non-negative integers)
   stopifnot(isTRUE(all.equal(A@x, as.integer(A@x))))
   test_edges <- stats::rbinom(length(A@x), size = A@x, prob = test_portion)
 
@@ -41,9 +42,11 @@ glaplacian <- function(A, regularize = TRUE) {
   L
 }
 
-# Given the trained left/right singular vectors, compute the test statistic for
+# given the training singular vectors, compute the test statistic for
 # graph dimension.
 gdstat <- function(full, test, u, v, test_portion) {
+
+  # standard error calculation is different for directed and undirected graphs
   if (isSymmetric(full)) {
     se <- sqrt(2 * test_portion * as.numeric(t(u^2) %*% full %*% v^2) -
       test_portion * sum(diag(full) * u^2 * v^2))
@@ -59,41 +62,56 @@ gdstat <- function(full, test, u, v, test_portion) {
 
 
 
-#' Edge Bootstrapping and Splitting
+#' Compute cross-validate eigenvalues
 #'
-#' Estimate the graph dimension via eigenvalue cross-validation (EigCV).
+#' Estimate graph dimension via eigenvalue cross-validation (EigCV).
 #' A graph has dimension `k` if the first `k` eigenvectors of its adjacency
 #' matrix are correlated with its population eigenspace, and the others are not.
 #' Edge bootstrapping sub-samples the edges of the graph (without replacement).
 #' Edge splitting separates the edges into a training part and a testing part.
 #'
-#' @param A The adjacency matrix of graph. Must be non-negative integer valued.
-#' @param k_max `integer(1)`, number of eigenvectors to compute.
-#' @param num_bootstraps `integer(1)`, number of graph bootstraps, default to 10.
-#'   Graph bootstrapping is to account for the randomness in graph splitting,
-#'   rather than obtaining any statistic (as a traditional num_bootstraps does).
-#'   Hence, a small number (e.g., 3~10) of bootstraps usually suffices.
-#'   If `num_bootstraps>1`, the test statistics will be averaged across bootstraps
+#' @param A The adjacency matrix of graph. Must be non-negative and
+#'  integer valued.
+#' @param k_max The maximum dimension of the graph to consider. This many
+#'   eigenvectors are computed. Should be a non-negative integer smallish
+#'   relative the dimensions of `A`.
+#' @param ... Ignored.
+#' @param num_bootstraps The number of times to bootstrap the graph. Since
+#'   cross-validated eigenvalues are based on a random graph split, they
+#'   are themselves random. By repeatedly computing cross-validated eigenvalues
+#'   for different sample splits, the idea is to smooth away some of the
+#'   randomness due to the graph splits. A small number of bootstraps
+#'   (3 to 10) usually suffices. Defaults to `10`. Test statistics (i.e.
+#'   z-scores for cv eigenvalues) are averaged across bootstraps
 #'   and the p-values will be calculated based on the averaged statistics.
-#' @param alpha Significance level of each test, defaults to `0.05`.
-#'   This is used to cut off the dimension estimation.
-#' @param ptol `numeric(1)`, the tolerance of minimal p-value.
-#' @param regularize TODO
-#' @param test_portion TODO
-#' @inheritParams stats::p.adjust
-#' @param laplacian `logical(1)`, use the normalized and regularized adjacency
-#'   matrix (i.e. L)
-#'   This option is experimental and should be used with caution.
-#' @return A `eigcv` object, which contains:
-#'   \item{estimated_dimension}{inferred graph dimension.}
-#'   \item{summary}{summary table of the tests.}
-#'   \item{num_bootstraps}{number of bootstraps performed.}
-#'   \item{test_portion}{graph splitting probability used.}
-#'   \item{alpha}{significance level of each test.}
+#' @param test_portion The portion of the graph to put into the test graph,
+#'   as opposed to the training graph. Defaults to `0.1`. Must be strictly
+#'   between zero and one.
+#' @param alpha Significance level for hypothesis tests. Each dimension
+#'   `1, ..., k_max` is tested when estimating graph dimension, and the
+#'   overall graph dimension is taken to be the smallest number of dimensions
+#'   such that all the tests reject.
+#' @param method Method to adjust p-values for multiple testing. Must be
+#'   one of `"none"`, `"holm"`, `"hochberg"`, `"hommel"`, `"bonferroni"`,
+#'   `"BH"`, `"BY"`, or `"fdr"`. Passed to [stats::p.adjust()]. Defaults to
+#'   `"none"`.
+#' @param laplacian Logical value indicating where to compute cross-validated
+#'   eigenvalues for the degree-normalize graph Laplacian rather than the
+#'   graph adjacency matrix. Experimental and should be used with caution.
+#'   Defaults to `FALSE`.
+#' @param regularize Only applicable when `laplacian == TRUE`, in which case
+#'   this parameter controls whether or not the degree-normalized graph
+#'   Laplacian is regularized. Defaults to `TRUE`.
+#'
+#' @return A `eigcv` object, which is a list with the following named
+#'   elements.
+#'
+#'   - `estimated_dimension`: inferred graph dimension.
+#'   - `summary`: summary table of the tests.
+#'   - `num_bootstraps`: number of bootstraps performed.
+#'   - `test_portion`: graph splitting probability used.
+#'   - `alpha`: significance level of each test.
 #'
-#' @importFrom dplyr summarize group_by ungroup mutate summarise
-#' @importFrom magrittr %>%
-#' @importFrom rlang .data
 #' @export
 #'
 #' @examples
@@ -116,16 +134,19 @@ gdstat <- function(full, test, u, v, test_portion) {
 #'
 #' A <- sample_sparse(model)
 #'
-#' eigcv_result <- eigcv(A, k_max = 10)
-#' eigcv_result
+#' eigs<- eigcv(A, k_max = 10)
+#' eigs
+#'
+#' plot(eigs, type = "z-score")    # default
+#' plot(eigs, type = "adjacency")
+#' plot(eigs, type = "laplacian")
+#'
 #'
 eigcv <- function(A, k_max,
                   ...,
                   num_bootstraps = 10, test_portion = 0.1,
                   alpha = 0.05,
-                  ptol = .Machine$double.eps,
-                  method = c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr",
-                             "none"),
+                  method = c("none", "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"),
                   laplacian = FALSE,
                   regularize = TRUE) {
   n <- min(dim(A))
@@ -182,29 +203,23 @@ eigcv <- function(A, k_max,
     pb$tick()
   }
 
-  ## summarize across CV/num_bootstraps
-  if (num_bootstraps > 1) {
-    cv_means <- cv_stats %>%
-      group_by(.data$k) %>%
-      summarise(
-        cv_lambda_A = mean(.data$cv_lambda_A),
-        cv_lambda_L = mean(.data$cv_lambda_L),
-        z = mean(.data$z)
-      ) %>%
-      ungroup()
-  } else {
-    cv_means <- cv_stats
-  }
+  cv_means <- cv_stats %>%
+    dplyr::group_by(k) %>%
+    dplyr::summarise(
+      cv_lambda_A = mean(cv_lambda_A),
+      cv_lambda_L = mean(cv_lambda_L),
+      z = mean(z)
+    ) %>%
+    dplyr::ungroup()
+
+
   cv_means <- cv_means %>%
-    mutate(
-      pvals = stats::pnorm(.data$z, lower.tail = FALSE),
-      pvals = pmax(.data$pvals, ptol)
-    ) ## avoid exact 0
+    dplyr::mutate(
+      pvals = stats::pnorm(z, lower.tail = FALSE)
+    )
 
-  ## correct for multiplicity
   cv_means$padj <- stats::p.adjust(cv_means$pvals, method = method)
 
-  ## inference
   criteria <- cv_means$padj
   k_stop <- which(criteria > alpha)
   k_infer <- ifelse(length(k_stop), min(k_stop) - 1, k_max)
@@ -221,69 +236,80 @@ eigcv <- function(A, k_max,
 }
 
 
-#' Print `eigcv`
+#' Print cross-validated eigenvalues
 #'
-#' @method print eigcv
-#'
-#' @param x an `eigcv` object.
+#' @param x An `eigcv` object created by a call to [eigcv()].
 #' @param ... Ignored.
 #' @export
+#'
+#' @inherit eigcv examples
+#' @return `x`, but invisibly.
+#'
+#' @method print eigcv
 print.eigcv <- function(x,  ...) {
   cat("Estimated graph dimension:\t", x$estimated_dimension, fill = TRUE)
-    cat("\nNumber of bootstraps:\t\t", x$num_bootstraps, fill = TRUE)
-    cat("Edge splitting probabaility:\t", x$test_portion, fill = TRUE)
-    cat("Significance level:\t\t", x$alpha, fill = TRUE)
-    cat("\n ------------ Summary of Tests ------------\n")
-    print(data.frame(x$summary[, -c(2, 3)]), row.names = FALSE)
-    cat(fill = TRUE)
+  cat("\nNumber of bootstraps:\t\t", x$num_bootstraps, fill = TRUE)
+  cat("Edge splitting probabaility:\t", x$test_portion, fill = TRUE)
+  cat("Significance level:\t\t", x$alpha, fill = TRUE)
+  cat("\n ------------ Summary of Tests ------------\n")
+  print(data.frame(x$summary[, -c(2, 3)]), row.names = FALSE)
+  cat(fill = TRUE)
+  invisible(x)
 }
 
-#' Plot `eigcv`
+#' Plot cross-validated eigenvalues
 #'
-#' @method plot eigcv
+#' @param x An `eigcv` object created by a call to [eigcv()].
+#' @param type Specifies what to plot. Must be one of the following options:
+#'
+#'    - `"z-score"`, in which case the Z-statistic test scores are plotted
+#'      for each value of `k` (i.e. dimension of the eigenspace).
+#'    - `"adjacency"` in which case the cross-validated eigenvalues of the
+#'      adjacency matrix are plotted for each value of `k`.
+#'    - `"laplacian"` in which case the cross-validated eigenvalues of the
+#'      graph Laplacian matrix are plotted for each value of `k`.
 #'
-#' @param x an `eigcv` object.
-#' @param type either "z", "A", or "L" to specify the y-axis of the plot.
-#'   If "z", plot the test statistics (asymptotic z score) for each k.
-#'   If "A", plot x'Ax for each eigenvector x.
-#'   If "L", plot x'Lx for each eigenvector x.
-#' @param threshold `numeric(1)`, cut-off of p-value (in log10), default to 2.
-#' @param ... ignored.
-#' @return Plot an `eigcv` object.
-#' @importFrom ggplot2 ggplot aes labs theme_bw theme scale_color_manual
-#' @importFrom ggplot2 geom_hline geom_point geom_line scale_x_continuous
-#' @importFrom magrittr %>%
-#' @importFrom dplyr select
-#' @importFrom rlang .data
+#' @param threshold Only used when `type == "z-score"`. Adds a horizontal
+#'   line at the value of `threshold`, which should be a numeric of length
+#'   one. Defaults to `2`.
+#' @param ... Ignored.
+#' @return A `ggplot2` object.
 #' @export
-plot.eigcv <- function(x, type = c("z", "A", "L"), threshold = 2, ...) {
+#' @method plot eigcv
+#'
+#' @inherit eigcv examples
+plot.eigcv <- function(x, type = c("z-score", "adjacency", "laplacian"), threshold = 2, ...) {
   stopifnot("Threshold of statistics must be greater than 0." = threshold > 0)
 
+  type <- rlang::arg_match(type)
+
   type <- type[1]
-  if (type == "z") {
-    dat <- x$summary %>% select(.data$k, val = .data$z)
+  if (type == "z-score") {
+    dat <- dplyr::select(x$summary, k, val = z)
     ylab <- "z score"
   }
-  if (type == "A") {
-    dat <- x$summary %>% select(.data$k, val = .data$cv_lambda_A)
+  if (type == "adjacency") {
+    dat <- dplyr::select(x$summary, k, val = cv_lambda_A)
     ylab <- "cross validated x' A x"
   }
-  if (type == "L") {
-    dat <- x$summary %>% select(.data$k, val = .data$cv_lambda_L)
+  if (type == "laplacian") {
+    dat <- dplyr::select(x$summary, k, val = cv_lambda_L)
     ylab <- "cross validated x' L x"
   }
 
-  g <- ggplot(aes(.data$k, .data$val), data = dat) +
-    geom_point(alpha = .8) +
-    geom_line(color = "blue") +
-    theme_bw() +
-    scale_x_continuous(breaks = function(x) {
-      unique(floor(pretty(seq(0, (max(x) + 1) * 1.1))))
-    })
-
-  if (type == "z") {
+  g <- ggplot2::ggplot(ggplot2::aes(k, val), data = dat) +
+    ggplot2::geom_point(alpha = .8) +
+    ggplot2::geom_line(color = "blue") +
+    ggplot2::theme_bw() +
+    ggplot2::scale_x_continuous(
+      breaks = function(x) {
+        unique(floor(pretty(seq(0, (max(x) + 1) * 1.1))))
+      }
+    )
+
+  if (type == "z-score") {
     g <- g +
-      geom_hline(
+      ggplot2::geom_hline(
         yintercept = threshold, alpha = .8,
         linetype = 2, color = "grey60", show.legend = TRUE
       )

diff --git a/R/utils.R b/R/utils.R
@@ -0,0 +1,24 @@
+#' Pipe operator
+#'
+#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
+#'
+#' @name %>%
+#' @rdname pipe
+#' @keywords internal
+#' @export
+#' @importFrom magrittr %>%
+#' @usage lhs \%>\% rhs
+#' @param lhs A value or the magrittr placeholder.
+#' @param rhs A function call using the magrittr semantics.
+#' @return The result of calling `rhs(lhs)`.
+NULL
+
+utils::globalVariables(
+  c(
+    "cv_lambda_A",
+    "cv_lambda_L",
+    "k",
+    "val",
+    "z"
+  )
+)
diff --git a/README.Rmd b/README.Rmd
@@ -21,7 +21,7 @@ knitr::opts_chunk$set(
 [![CRAN status](https://www.r-pkg.org/badges/version/gdim)](https://CRAN.R-project.org/package=gdim)
 <!-- badges: end -->
 
-`gdim` estimates graph dimension using cross-validated eigenvalues, via the graph-splitting technique developed in <http://arxiv.org/abs/2108.03336>. Theoretically, the method works by computing a special type of cross-validated eigenvalue which follows a simple central limit theorem. This allows users to perform hypothesis tests on the rank of the graph.
+`gdim` estimates graph dimension using cross-validated eigenvalues, via the graph-splitting technique developed in <https://arxiv.org/abs/2108.03336>. Theoretically, the method works by computing a special type of cross-validated eigenvalue which follows a simple central limit theorem. This allows users to perform hypothesis tests on the rank of the graph.
 
 ## Installation
 
@@ -77,4 +77,4 @@ plot(eigcv_result)
 
 ## Reference
 
-Chen, Fan, Sebastien Roch, Karl Rohe, and Shuqi Yu. “Estimating Graph Dimension with Cross-Validated Eigenvalues.” ArXiv:2108.03336 [Cs, Math, Stat], August 6, 2021. http://arxiv.org/abs/2108.03336.
+Chen, Fan, Sebastien Roch, Karl Rohe, and Shuqi Yu. “Estimating Graph Dimension with Cross-Validated Eigenvalues.” ArXiv:2108.03336 [Cs, Math, Stat], August 6, 2021. https://arxiv.org/abs/2108.03336.