StatFunGen · xueweic · May 25, 2026 · May 24, 2026 · May 24, 2026 · May 25, 2026
diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R
@@ -120,34 +120,67 @@ get_hierarchical_clusters <- function(cormat, min_cluster_corr = 0.8) {
 #' @noRd
 get_modularity <- function(Weight, B) {
   if (dim(Weight)[1] == 1) return(0)
-
+  if (nrow(B) != nrow(Weight)) {
+    stop("B must have one row per variable in Weight")
+  }
+  if (any(rowSums(B) == 0)) {
+    stop("Each row of B must belong to one cluster")
+  }
+  index <- max.col(B, ties.method = "first")
+  get_modularity_partition(get_modularity_components(Weight), index, ncol(B))
+}
+
+get_modularity_components <- function(Weight) {
   W_pos <- Weight * (Weight > 0)
   W_neg <- Weight * (Weight < 0)
-  N <- dim(Weight)[1]
   K_pos <- colSums(W_pos)
   K_neg <- colSums(W_neg)
   m_pos <- sum(K_pos)
   m_neg <- sum(K_neg)
-  m <- m_pos + m_neg
-
-  if (m == 0) return(0)
-
-  # cate <- B %*% t(B)
-  cate <- tcrossprod(B)
-
-  if (m_pos == 0 & m_neg == 0) return(0)
-
-  if (m_pos == 0) {
+  list(
+    W_pos = W_pos,
+    W_neg = W_neg,
+    K_pos = K_pos,
+    K_neg = K_neg,
+    m_pos = m_pos,
+    m_neg = m_neg,
+    m = m_pos + m_neg
+  )
+}
+
+get_modularity_partition <- function(mod_components, index, n_cluster) {
+  if (nrow(mod_components$W_pos) == 1) return(0)
+  if (mod_components$m == 0) return(0)
+
+  index_factor <- factor(index, levels = seq_len(n_cluster))
+  block_sum <- function(W) {
+    row_sums <- rowsum(W, index_factor, reorder = TRUE)
+    vapply(seq_len(n_cluster), function(i) sum(row_sums[i, index == i]), numeric(1))
+  }
+  group_sum <- function(x) {
+    as.numeric(rowsum(matrix(x, ncol = 1), index_factor, reorder = TRUE))
+  }
+
+  A_pos <- block_sum(mod_components$W_pos)
+  A_neg <- block_sum(mod_components$W_neg)
+  D_pos <- group_sum(mod_components$K_pos)
+  D_neg <- group_sum(mod_components$K_neg)
+
+  if (mod_components$m_pos == 0 & mod_components$m_neg == 0) return(0)
+
+  if (mod_components$m_pos == 0) {
     Q_positive <- 0
-    Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg
-  } else if (m_neg == 0) {
-    Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos
+    Q_negative <- sum(A_neg - D_neg^2 / mod_components$m_neg) / mod_components$m_neg
+  } else if (mod_components$m_neg == 0) {
+    Q_positive <- sum(A_pos - D_pos^2 / mod_components$m_pos) / mod_components$m_pos
     Q_negative <- 0
   } else {
-    Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos
-    Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg
+    Q_positive <- sum(A_pos - D_pos^2 / mod_components$m_pos) / mod_components$m_pos
+    Q_negative <- sum(A_neg - D_neg^2 / mod_components$m_neg) / mod_components$m_neg
   }
-  Q <- m_pos / m * Q_positive - m_neg / m * Q_negative
+
+  Q <- mod_components$m_pos / mod_components$m * Q_positive -
+    mod_components$m_neg / mod_components$m * Q_negative
   return(Q)
 }
 
@@ -165,15 +198,16 @@ get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) {
     IND <- 1
     Q <- 1
   } else {
-    Q <- c()
+    Q <- numeric(m)
     if (ncol(Sigma) < 10) {
       m <- ncol(Sigma)
+      Q <- numeric(m)
     }
+    mod_components <- get_modularity_components(Sigma)
     for (i in 1:m)
     {
       index <- cutree(hc, i)
-      B <- sapply(1:i, function(t) as.numeric(index == t))
-      Q[i] <- get_modularity(Sigma, B)
+      Q[i] <- get_modularity_partition(mod_components, index, i)
     }
 
     IND <- which(Q == max(Q))

diff --git a/tests/testthat/helper-plot-device.R b/tests/testthat/helper-plot-device.R
@@ -0,0 +1,12 @@
+.colocboost_plot_device_env <- new.env(parent = emptyenv())
+.colocboost_plot_device_env$file <- tempfile("colocboost-test-plots-", fileext = ".pdf")
+grDevices::pdf(.colocboost_plot_device_env$file)
+
+reg.finalizer(.colocboost_plot_device_env, function(e) {
+  while (grDevices::dev.cur() > 1) {
+    grDevices::dev.off()
+  }
+  unlink(e$file)
+  unlink("Rplots.pdf")
+  unlink(file.path("tests", "testthat", "Rplots.pdf"))
+}, onexit = TRUE)
diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R
@@ -237,6 +237,76 @@ test_that("get_hierarchical_clusters functions correctly", {
   expect_equal(result_all_high$n_cluster, 1)
 })
 
+test_that("block-sum modularity path matches reference modularity", {
+  set.seed(20260525)
+
+  get_modularity_reference <- function(Weight, B) {
+    if (dim(Weight)[1] == 1) return(0)
+    W_pos <- Weight * (Weight > 0)
+    W_neg <- Weight * (Weight < 0)
+    K_pos <- colSums(W_pos)
+    K_neg <- colSums(W_neg)
+    m_pos <- sum(K_pos)
+    m_neg <- sum(K_neg)
+    m <- m_pos + m_neg
+    if (m == 0) return(0)
+    cate <- tcrossprod(B)
+    if (m_pos == 0 & m_neg == 0) return(0)
+    if (m_pos == 0) {
+      Q_positive <- 0
+      Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg
+    } else if (m_neg == 0) {
+      Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos
+      Q_negative <- 0
+    } else {
+      Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos
+      Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg
+    }
+    m_pos / m * Q_positive - m_neg / m * Q_negative
+  }
+
+  get_n_cluster_reference <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) {
+    if (min(Sigma) > min_cluster_corr) {
+      return(list(n_cluster = 1, Qmodularity = 1))
+    }
+    if (ncol(Sigma) < 10) {
+      m <- ncol(Sigma)
+    }
+    Q <- numeric(m)
+    for (i in seq_len(m)) {
+      index <- cutree(hc, i)
+      B <- sapply(seq_len(i), function(t) as.numeric(index == t))
+      Q[i] <- get_modularity_reference(Sigma, B)
+    }
+    IND <- which(Q == max(Q))
+    if (length(IND) > 1) IND <- IND[1]
+    list(n_cluster = IND, Qmodularity = Q)
+  }
+
+  for (p in c(8, 20, 50)) {
+    Sigma <- matrix(rnorm(p * p), p, p)
+    Sigma <- (Sigma + t(Sigma)) / 2
+    diag(Sigma) <- 1
+    hc <- hclust(as.dist(1 - Sigma))
+
+    current <- get_n_cluster(hc, Sigma, m = p, min_cluster_corr = 0.999)
+    reference <- get_n_cluster_reference(hc, Sigma, m = p, min_cluster_corr = 0.999)
+
+    expect_equal(current$n_cluster, reference$n_cluster)
+    expect_equal(current$Qmodularity, reference$Qmodularity, tolerance = 1e-10)
+
+    for (k in c(1, ceiling(p / 3), p)) {
+      index <- cutree(hc, k)
+      B <- sapply(seq_len(k), function(t) as.numeric(index == t))
+      direct <- get_modularity_reference(Sigma, B)
+      components <- get_modularity_components(Sigma)
+      block_sum <- get_modularity_partition(components, index, k)
+      expect_equal(block_sum, direct, tolerance = 1e-10)
+      expect_equal(get_modularity(Sigma, B), direct, tolerance = 1e-10)
+    }
+  }
+})
+
 
 # Test get_ambiguous_colocalization function
 test_that("get_ambiguous_colocalization identifies ambiguous colocalizations correctly", {
@@ -1176,4 +1246,4 @@ test_that("get_robust_ucos with different cutoffs produces expected ordering", {
 
   # The number of remaining ucos should be monotonically non-increasing
   expect_true(all(diff(n_ucos_remaining) <= 0))
-})
+})
diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
@@ -23,7 +23,7 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost
 
 Acknowledgment: Thanks to Kate (Kathryn) Lawrence (GitHub:@kal26) for her contributions to this vignette.
 
-# 1. Loading Data using `colocboost_analysis_pipeline` function
+# 1. Loading Data using `colocboost_pipeline` function
 
 This function harmonizes the input data and prepares it for colocalization analysis. 
 
@@ -32,7 +32,7 @@ In this section, we introduce how to load the regional data required for the Col
 This function loads mixed datasets for a specific region, including individual-level data (genotype, phenotype, covariate data), summary statistics 
 (sumstats, LD), or a combination of both. It runs `load_regional_univariate_data` for each individual-level dataset and `load_rss_data` for each summary statistics dataset.
 The output is a list with `individual_data` and `sumstat_data` components, where `individual_data` is a list of individual-level data and `sumstat_data` is a list of summary statistics data. 
-This list is then passed to the `colocboost_analysis_pipeline` function for the colocalization analysis.
+This list is then passed to the `colocboost_pipeline` function for the colocalization analysis.
 
 
 Below are the input parameters for this function for loading individual-level data:
@@ -199,9 +199,9 @@ The LD metadata file is a tab-separated file with the following columns:
 ```
 
 
-# 2. Perform ColocBoost using `colocboost_analysis_pipeline` function
+# 2. Perform ColocBoost using `colocboost_pipeline` function
 
-In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_analysis_pipeline` function. 
+In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_pipeline` function.
 The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data):
 
 - **`xQTL-only mode`**: Only perform colocalization analysis on the individual-level data. Summary statistics data is not used.
@@ -226,7 +226,7 @@ Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern =
 
 Outputs:
 
-- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.
+- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.
 
 ```{r, colocboost-analysis, eval = FALSE}
 #### Please check the example code below ####
@@ -263,7 +263,7 @@ pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list))
 qc_method = "rss_qc" 
 
 # run colocboost analysis
-colocboost_results <- colocboost_analysis_pipeline(
+colocboost_results <- colocboost_pipeline(
     region_data_combined,
     maf_cutoff = maf_cutoff, 
     pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,