StatFunGen · xueweic · Apr 21, 2025 · Apr 21, 2025 · Apr 21, 2025
diff --git a/R/colocboost.R b/R/colocboost.R
@@ -39,7 +39,7 @@
 #' @param effect_se Matrix of standard errors associated with the beta values
 #' @param effect_n A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommended!
 #'
-#' @param M The maximum number of gradient boosting rounds. If the number of outcomes are large, it will be automatically increased to a larger number.
+#' @param M The maximum number of gradient boosting rounds for each outcome (default is 500).
 #' @param stop_thresh The stop criterion for overall profile loglikelihood function.
 #' @param tau The smooth parameter for proximity adaptive smoothing weights for the best update jk-star.
 #' @param learning_rate_init The minimum learning rate for updating in each iteration.
@@ -140,7 +140,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
                        effect_se = NULL, # same as HyPrColoc, sebeta hat matrix with rowname of variable names
                        effect_n = NULL,
                        ###### - Model Parameters
-                       M = NULL, # maximum iteration time
+                       M = 500, # maximum iteration for each outcome 
                        stop_thresh = 1e-06, # stop criterion for profile_log and objective functions
                        tau = 0.01, # kernal_tau parameter for proximity smoothing weight
                        learning_rate_init = 0.01, # minimum step size for updating in each boosting round

diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R
@@ -60,9 +60,9 @@ get_cormat <- function(X, intercepte = TRUE) {
   return(cr)
 }
 
-#' @title Perform modularity-based hierarchical clustering
+#' @title Perform modularity-based hierarchical clustering for a correlation matrix
 #' @description
-#' This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix (LD matrix).
+#' This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix.
 #'
 #' @param cormat A correlation matrix.
 #' @param min_cluster_corr The small correlation for the weights distributions across different iterations to be decided having only one cluster. Default is 0.8.
@@ -495,7 +495,7 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) {
     if (delta <= 0) {
       warning(paste(
         "Warning message: potential sumstat & LD mismatch may happens for outcome", outcome_idx,
-        ". Using logLR  = CoS(profile) - max(profile). Please check our website."
+        ". Using logLR  = CoS(profile) - max(profile). Please check our website https://statfungen.github.io/colocboost/articles/."
       ))
     }
     cos_profile

diff --git a/R/colocboost_init.R b/R/colocboost_init.R
@@ -263,6 +263,11 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01,
   }
   # - initial coloc_thresh
   coloc_thresh <- (1 - coloc_thresh) * max(sapply(1:length(cb_model), function(i) max(cb_model[[i]]$change_loglike)))
+  # - initial num_updates_outcome and coveraged outcome
+  num_updates_outcome <- lapply(1:L, function(i) 1)
+  coveraged_outcome <- lapply(1:L, function(i) TRUE)
+  names(num_updates_outcome) <- names(coveraged_outcome) <- names(cb_model)
+
   cb_model_para <- list(
     "L" = L,
     "P" = P,
@@ -289,7 +294,9 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01,
     "variables" = cb_data$variable.names,
     "focal_outcome_idx" = focal_outcome_idx,
     "coveraged" = TRUE,
-    "num_updates" = 1
+    "num_updates" = 1,
+    "coveraged_outcome" = coveraged_outcome,
+    "num_updates_outcome" = num_updates_outcome
   )
   class(cb_model_para) <- "colocboost"
 

diff --git a/R/colocboost_output.R b/R/colocboost_output.R
@@ -977,7 +977,9 @@ get_model_info <- function(cb_obj, outcome_names = NULL) {
 
   profile_loglik <- cb_obj$cb_model_para$profile_loglike
   n_updates <- cb_obj$cb_model_para$num_updates
+  n_updates_outcome <- cb_obj$cb_model_para$num_updates_outcome
   model_coveraged <- cb_obj$cb_model_para$coveraged
+  model_coveraged_outcome <- cb_obj$cb_model_para$coveraged_outcome
   jk_update <- cb_obj$cb_model_para$real_update_jk
   if (!is.null(jk_update)){
     rownames(jk_update) <- paste0("jk_star_", 1:nrow(jk_update))
@@ -987,14 +989,17 @@ get_model_info <- function(cb_obj, outcome_names = NULL) {
   outcome_coupled_best_update_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_single)
   outcome_profile_loglik <- lapply(cb_obj$cb_model, function(cb) cb$profile_loglike_each)
   names(outcome_proximity_obj) <- names(outcome_coupled_best_update_obj) <-
-    names(outcome_profile_loglik) <- outcome_names
+    names(outcome_profile_loglik) <- names(n_updates_outcome) <- 
+    names(model_coveraged_outcome) <- outcome_names
   ll <- list(
     "model_coveraged" = model_coveraged,
     "n_updates" = n_updates,
     "profile_loglik" = profile_loglik,
     "outcome_profile_loglik" = outcome_profile_loglik,
     "outcome_proximity_obj" = outcome_proximity_obj,
     "outcome_coupled_best_update_obj" = outcome_coupled_best_update_obj,
+    "outcome_model_coveraged" = model_coveraged_outcome,
+    "outcome_n_updates" = n_updates_outcome,
     "jk_star" = jk_update
   )
   return(ll)

diff --git a/R/colocboost_update.R b/R/colocboost_update.R
@@ -175,7 +175,7 @@ boost_KL_delta <- function(z, ld_feature, adj_dep,
 }
 
 
-boost_check_stop <- function(cb_model, cb_model_para, pos_stop,
+boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage,
                              multi_test_max = 1) {
   # - check the iteration for the stop outcome (pos_stop has the same jk with original data)
   iter_each <- sapply(pos_stop, function(i) {
@@ -189,6 +189,7 @@ boost_check_stop <- function(cb_model, cb_model_para, pos_stop,
     # --- stop all pos_stop outcomes
     cb_model_para$update_y[pos_stop] <- 0
     cb_model_para$true_stop <- pos_stop
+    cb_model_para$no_coverage_stop <- which(stop_no_coverage==TRUE)
     cb_model_para$need_more <- NULL
   } else {
     # keep boosting for outcome with <= 10 iterations

diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R
@@ -13,7 +13,7 @@
 #'
 #' @noRd
 colocboost_workhorse <- function(cb_data,
-                                 M = NULL,
+                                 M = 500,
                                  tau = 0.01,
                                  prioritize_jkstar = TRUE,
                                  learning_rate_init = 0.01,
@@ -70,11 +70,13 @@ colocboost_workhorse <- function(cb_data,
     coloc_thresh = coloc_thresh
   )
 
-  M_single_outcome <- 200      
-  if (is.null(M)) {
-    M <- cb_model_para$L * M_single_outcome
+  if (M == 1){
+    cb_model_para$M <- 1
+  } else {
+    M_single_outcome <- M
+    M <- cb_model_para$L * M
+    cb_model_para$M <- M
   }
-  cb_model_para$M <- M
   # - if multi_test value > multi_test cutoff for some outcomes, we will not update them.
   if (!is.null(cb_model_para$true_stop)) {
     if (sum(cb_model_para$update_y == 1) == 0) {
@@ -133,20 +135,25 @@ colocboost_workhorse <- function(cb_data,
         pos_rtr_stop <- which(sum_cor == 0)
         if (length(pos_rtr_stop) != 0) {
           cb_model_para$update_y[pos.update[pos_rtr_stop]] <- 0
+          if (!is.null(focal_outcome_idx)){
+            message_focal_text <- if (focal_outcome_idx %in% pos.update[pos_rtr_stop]) "including focal outcome" else NULL
+          } else {message_focal_text <- NULL}
           message(paste(
-            "Gradient boosting for outcome", paste(pos.update[pos_rtr_stop], collapse = ", "),
+            "Gradient boosting for outcome", paste(pos.update[pos_rtr_stop], collapse = ", "), message_focal_text,
             "stop since rtr < 0 or max(correlation) > 1 after", m, "iterations!",
             "Results for this locus are not stable, please check if mismatch between sumstat and LD!",
-            "See details in tutorial website."
+            "See details in tutorial website https://statfungen.github.io/colocboost/articles/."
           ))
         }
 
         # - stop by others - need to check iterations
-        stop <- rep(NA, cb_model_para$L)
+        stop <- stop_no_coverage <- rep(NA, cb_model_para$L)
         for (i in pos.update) {
           if (i %in% pos.update[pos_rtr_stop]) {
-            stop[i] <- TRUE
+            stop[i] <- stop_no_coverage[i] <- TRUE
+            cb_model_para$coveraged_outcome[i] <- FALSE
           } else {
+            stop_no_coverage[i] <- FALSE
             M_i <- length(cb_model[[i]]$profile_loglike_each)
             stop1 <- abs(cb_model[[i]]$profile_loglike_each[M_i] - cb_model[[i]]$profile_loglike_each[M_i - 1]) /
               cb_model[[i]]$profile_loglike_each[M_i - 1] < cb_model[[i]]$stop_thresh
@@ -162,7 +169,15 @@ colocboost_workhorse <- function(cb_data,
             stop3 <- (M_i > M_single_outcome)
             # -- to ensure if some outcomes do not update previously
             if (length(cb_model[[i]]$profile_loglike_each) >= 2) {
-              stop[i] <- (stop1 | stop2 | stop3) # (stop2 | stop3) # (stop1 | stop2 | stop3)
+              stop[i] <- (stop1 | stop2 | stop3)
+              if (stop3){
+                stop_no_coverage[i] <- TRUE
+                cb_model_para$coveraged_outcome[i] <- FALSE
+                warning(paste("ColocBoost gradient boosting for outcome", i, "did not coverage in",
+                              M_single_outcome, "iterations! Please check consistency between summary statistics",
+                              "and LD matrix. See details in tutorial website https://statfungen.github.io/colocboost/articles/."))
+
+              }
             } else {
               stop[i] <- FALSE
             }
@@ -172,15 +187,15 @@ colocboost_workhorse <- function(cb_data,
         if (all(length(stop) == 1 & stop)) {
           cb_model_para$update_y <- 0
           if (cb_model_para$L == 1) {
-            message(paste("Gradient boosting for outcome 1 converge after", m, "iterations!"))
+            if (!stop_no_coverage) message(paste("Gradient boosting for outcome 1 converged after", m, "iterations!"))
           }
         } else {
           if (all(!stop[pos.update])) {
             # -- all Y do not stop, need more iterations.
             cb_model_para$update_y <- cb_model_para$update_y
           } else {
             pos_stop <- which(stop) # which outcome reach the stop criterion
-            ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop,
+            ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, stop_no_coverage,
               multi_test_max = multi_test_max
             )
             cb_model_para <- ttmp$cb_model_para
@@ -190,29 +205,26 @@ colocboost_workhorse <- function(cb_data,
               ####### ---------------------------------------------
               # calculate objective function of Y for the last iteration.
               cb_model <- boost_obj_last(cb_data, cb_model, cb_model_para)
+              tt_stop <- setdiff(cb_model_para$true_stop, cb_model_para$no_coverage_stop)
               if (!is.null(focal_outcome_idx)) {
-                if (focal_outcome_idx %in% cb_model_para$true_stop) {
+                if (focal_outcome_idx %in% tt_stop) {
                   message(paste(
                     "Gradient boosting for focal outcome", focal_outcome_idx,
                     "converged after", m, "iterations!"
                   ))
-                  if (length(setdiff(cb_model_para$true_stop, focal_outcome_idx)) != 0) {
+                  if (length(setdiff(tt_stop, focal_outcome_idx)) != 0) {
                     message(paste(
-                      "Gradient boosting for outcome", paste(setdiff(cb_model_para$true_stop, focal_outcome_idx), collapse = ", "),
+                      "Gradient boosting for outcome", paste(setdiff(tt_stop, focal_outcome_idx), collapse = ", "),
                       "converged after", m, "iterations!"
                     ))
                   }
                 } else {
-                  message(paste(
-                    "Gradient boosting for outcome", paste(cb_model_para$true_stop, collapse = ", "),
-                    "converged after", m, "iterations!"
-                  ))
+                  if (length(tt_stop) != 0)
+                    message(paste("Gradient boosting for outcome", paste(tt_stop, collapse = ", "), "converged after", m, "iterations!"))
                 }
               } else {
-                message(paste(
-                  "Gradient boosting for outcome", paste(cb_model_para$true_stop, collapse = ", "),
-                  "converged after", m, "iterations!"
-                ))
+                if (length(tt_stop) != 0)
+                  message(paste("Gradient boosting for outcome", paste(tt_stop, collapse = ", "), "converged after", m, "iterations!"))
               }
             }
           }
@@ -284,5 +296,8 @@ cb_model_para_update <- function(cb_model, cb_model_para) {
   # - additional calculating profile_loglikelihood
   tmp <- sum(sapply(1:length(cb_model), function(i) tail(cb_model[[i]]$profile_loglike_each, n = 1)))
   cb_model_para$profile_loglike <- c(cb_model_para$profile_loglike, tmp)
+  num_updates_outcomes <- sapply(1:length(cb_model), function(i) length(cb_model[[i]]$profile_loglike_each))
+  names(num_updates_outcomes) <- names(cb_model)
+  cb_model_para$num_updates_outcome <- num_updates_outcomes
   return(cb_model_para)
 }
diff --git a/man/colocboost.Rd b/man/colocboost.Rd
diff --git a/man/get_hierarchical_clusters.Rd b/man/get_hierarchical_clusters.Rd
diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R
@@ -133,8 +133,8 @@ test_that("colocboost_plot handles filtering options", {
   # Test with plot_focal_only
   expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_only = TRUE)), NA)
 
-  # Test with plot_focal_cos_outocme_only
-  expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_cos_outocme_only = TRUE)), NA)
+  # Test with plot_focal_cos_outcome_only
+  expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_cos_outcome_only = TRUE)), NA)
 })
 
 # Test colocboost_plot with visual customization options

diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
@@ -1,8 +1,8 @@
 ---
-title: "ColocBoost Wrapper Pipeline"
+title: "Bioinformatics Pipeline for ColocBoost"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{ColocBoost Wrapper Pipeline}
+  %\VignetteIndexEntry{Bioinformatics Pipeline for ColocBoost}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---
@@ -15,7 +15,7 @@ knitr::opts_chunk$set(
 ```
 
 
-This vignette demonstrates how to use the ColocBoost wrapper pipeline to perform colocalization analysis with `colocboost`. 
+This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost to perform colocalization analysis with `colocboost`. 
 
 - See more details about functions in the package `pecotmr` with [link](https://github.com/StatFunGen/pecotmr/tree/main) and 
 `colocboost_pipeline` with [link](https://github.com/StatFunGen/pecotmr/blob/main/R/colocboost_pipeline.R). 
@@ -123,7 +123,7 @@ In this section, we perform the colocalization analysis using the `colocboost_an
 - **`region_data`**: The output of the `load_multitask_regional_data` function.
 - **`focal_trait`**: Name of the trait if performing disease-prioritized ColocBoost.
 - **`event_filters`**: A list of patterns for filtering events based on context names. 
-Example: for sQTL, list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:").
+Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:")`.
 - **`maf_cutoff`**: A scalar to remove variants with maf < maf_cutoff, default is 0.005.
 - **`pip_cutoff_to_skip_ind`**: A vector of cutoff values for skipping analysis based on PIP values for each context. Default is 0.
 - **`pip_cutoff_to_skip_sumstat`**: A vector of cutoff values for skipping analysis based on PIP values for each sumstat. Default is 0.

diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd
@@ -1,13 +1,13 @@
 ---
-title: "Summary Statistics Colocalization"
+title: "Summary Statistics Data Colocalization"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{Summary Statistics Colocalization}
+  %\VignetteIndexEntry{Summary Statistics Data Colocalization}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---
 
-```{r, include = FALSE}
+```{r, include = FALSE}s
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>"