From 311cb3c8ab6a430bc5e9f84635a468901a9b3d22 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Mon, 21 Apr 2025 13:24:56 -0400
Subject: [PATCH 1/2] minor fix

1. change to per-outcome maximum iterations
2. minor fix on documentation
3. minor fix on website
---
 R/colocboost.R                                |  4 +-
 R/colocboost_inference.R                      |  6 +-
 R/colocboost_init.R                           |  9 ++-
 R/colocboost_output.R                         |  7 ++-
 R/colocboost_update.R                         |  3 +-
 R/colocboost_workhorse.R                      | 59 +++++++++++--------
 man/colocboost.Rd                             |  4 +-
 man/get_hierarchical_clusters.Rd              |  4 +-
 tests/testthat/test_plot.R                    |  4 +-
 vignettes/ColocBoost_Wrapper_Pipeline.Rmd     |  8 +--
 .../Summary_Statistics_Colocalization.Rmd     |  6 +-
 11 files changed, 70 insertions(+), 44 deletions(-)

diff --git a/R/colocboost.R b/R/colocboost.R
index 82bff67..3c015e7 100644
--- a/R/colocboost.R
+++ b/R/colocboost.R
@@ -39,7 +39,7 @@
 #' @param effect_se Matrix of standard errors associated with the beta values
 #' @param effect_n A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommended!
 #'
-#' @param M The maximum number of gradient boosting rounds. If the number of outcomes are large, it will be automatically increased to a larger number.
+#' @param M The maximum number of gradient boosting rounds for each outcome (default is 500).
 #' @param stop_thresh The stop criterion for overall profile loglikelihood function.
 #' @param tau The smooth parameter for proximity adaptive smoothing weights for the best update jk-star.
 #' @param learning_rate_init The minimum learning rate for updating in each iteration.
@@ -140,7 +140,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
                        effect_se = NULL, # same as HyPrColoc, sebeta hat matrix with rowname of variable names
                        effect_n = NULL,
                        ###### - Model Parameters
-                       M = NULL, # maximum iteration time
+                       M = 500, # maximum iteration for each outcome 
                        stop_thresh = 1e-06, # stop criterion for profile_log and objective functions
                        tau = 0.01, # kernal_tau parameter for proximity smoothing weight
                        learning_rate_init = 0.01, # minimum step size for updating in each boosting round
diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R
index 511a258..0648bd8 100644
--- a/R/colocboost_inference.R
+++ b/R/colocboost_inference.R
@@ -60,9 +60,9 @@ get_cormat <- function(X, intercepte = TRUE) {
   return(cr)
 }
 
-#' @title Perform modularity-based hierarchical clustering
+#' @title Perform modularity-based hierarchical clustering for a correlation matrix
 #' @description
-#' This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix (LD matrix).
+#' This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix.
 #'
 #' @param cormat A correlation matrix.
 #' @param min_cluster_corr The small correlation for the weights distributions across different iterations to be decided having only one cluster. Default is 0.8.
@@ -495,7 +495,7 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) {
     if (delta <= 0) {
       warning(paste(
         "Warning message: potential sumstat & LD mismatch may happens for outcome", outcome_idx,
-        ". Using logLR  = CoS(profile) - max(profile). Please check our website."
+        ". Using logLR  = CoS(profile) - max(profile). Please check our website https://statfungen.github.io/colocboost/articles/."
       ))
     }
     cos_profile
diff --git a/R/colocboost_init.R b/R/colocboost_init.R
index 1fe75c4..767e7bf 100644
--- a/R/colocboost_init.R
+++ b/R/colocboost_init.R
@@ -263,6 +263,11 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01,
   }
   # - initial coloc_thresh
   coloc_thresh <- (1 - coloc_thresh) * max(sapply(1:length(cb_model), function(i) max(cb_model[[i]]$change_loglike)))
+  # - initial num_updates_outcome and coveraged outcome
+  num_updates_outcome <- lapply(1:L, function(i) 1)
+  coveraged_outcome <- lapply(1:L, function(i) TRUE)
+  names(num_updates_outcome) <- names(coveraged_outcome) <- names(cb_model)
+  
   cb_model_para <- list(
     "L" = L,
     "P" = P,
@@ -289,7 +294,9 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01,
     "variables" = cb_data$variable.names,
     "focal_outcome_idx" = focal_outcome_idx,
     "coveraged" = TRUE,
-    "num_updates" = 1
+    "num_updates" = 1,
+    "coveraged_outcome" = coveraged_outcome,
+    "num_updates_outcome" = num_updates_outcome
   )
   class(cb_model_para) <- "colocboost"
 
diff --git a/R/colocboost_output.R b/R/colocboost_output.R
index b347b4a..4267891 100644
--- a/R/colocboost_output.R
+++ b/R/colocboost_output.R
@@ -977,7 +977,9 @@ get_model_info <- function(cb_obj, outcome_names = NULL) {
 
   profile_loglik <- cb_obj$cb_model_para$profile_loglike
   n_updates <- cb_obj$cb_model_para$num_updates
+  n_updates_outcome <- cb_obj$cb_model_para$num_updates_outcome
   model_coveraged <- cb_obj$cb_model_para$coveraged
+  model_coveraged_outcome <- cb_obj$cb_model_para$coveraged_outcome
   jk_update <- cb_obj$cb_model_para$real_update_jk
   if (!is.null(jk_update)){
     rownames(jk_update) <- paste0("jk_star_", 1:nrow(jk_update))
@@ -987,7 +989,8 @@ get_model_info <- function(cb_obj, outcome_names = NULL) {
   outcome_coupled_best_update_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_single)
   outcome_profile_loglik <- lapply(cb_obj$cb_model, function(cb) cb$profile_loglike_each)
   names(outcome_proximity_obj) <- names(outcome_coupled_best_update_obj) <-
-    names(outcome_profile_loglik) <- outcome_names
+    names(outcome_profile_loglik) <- names(n_updates_outcome) <- 
+    names(model_coveraged_outcome) <- outcome_names
   ll <- list(
     "model_coveraged" = model_coveraged,
     "n_updates" = n_updates,
@@ -995,6 +998,8 @@ get_model_info <- function(cb_obj, outcome_names = NULL) {
     "outcome_profile_loglik" = outcome_profile_loglik,
     "outcome_proximity_obj" = outcome_proximity_obj,
     "outcome_coupled_best_update_obj" = outcome_coupled_best_update_obj,
+    "outcome_model_coveraged" = model_coveraged_outcome,
+    "outcome_n_updates" = n_updates_outcome,
     "jk_star" = jk_update
   )
   return(ll)
diff --git a/R/colocboost_update.R b/R/colocboost_update.R
index 546c7df..5ae8f3e 100644
--- a/R/colocboost_update.R
+++ b/R/colocboost_update.R
@@ -175,7 +175,7 @@ boost_KL_delta <- function(z, ld_feature, adj_dep,
 }
 
 
-boost_check_stop <- function(cb_model, cb_model_para, pos_stop,
+boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage,
                              multi_test_max = 1) {
   # - check the iteration for the stop outcome (pos_stop has the same jk with original data)
   iter_each <- sapply(pos_stop, function(i) {
@@ -189,6 +189,7 @@ boost_check_stop <- function(cb_model, cb_model_para, pos_stop,
     # --- stop all pos_stop outcomes
     cb_model_para$update_y[pos_stop] <- 0
     cb_model_para$true_stop <- pos_stop
+    cb_model_para$no_coverage_stop <- which(stop_no_coverage==TRUE)
     cb_model_para$need_more <- NULL
   } else {
     # keep boosting for outcome with <= 10 iterations
diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R
index 1269c2b..9b2bcbc 100644
--- a/R/colocboost_workhorse.R
+++ b/R/colocboost_workhorse.R
@@ -13,7 +13,7 @@
 #'
 #' @noRd
 colocboost_workhorse <- function(cb_data,
-                                 M = NULL,
+                                 M = 500,
                                  tau = 0.01,
                                  prioritize_jkstar = TRUE,
                                  learning_rate_init = 0.01,
@@ -70,11 +70,13 @@ colocboost_workhorse <- function(cb_data,
     coloc_thresh = coloc_thresh
   )
   
-  M_single_outcome <- 200      
-  if (is.null(M)) {
-    M <- cb_model_para$L * M_single_outcome
+  if (M == 1){
+    cb_model_para$M <- 1
+  } else {
+    M_single_outcome <- M
+    M <- cb_model_para$L * M
+    cb_model_para$M <- M
   }
-  cb_model_para$M <- M
   # - if multi_test value > multi_test cutoff for some outcomes, we will not update them.
   if (!is.null(cb_model_para$true_stop)) {
     if (sum(cb_model_para$update_y == 1) == 0) {
@@ -133,20 +135,23 @@ colocboost_workhorse <- function(cb_data,
         pos_rtr_stop <- which(sum_cor == 0)
         if (length(pos_rtr_stop) != 0) {
           cb_model_para$update_y[pos.update[pos_rtr_stop]] <- 0
+          message_focal_text <- if (focal_outcome_idx %in% pos.update[pos_rtr_stop]) "including focal outcome" else NULL
           message(paste(
-            "Gradient boosting for outcome", paste(pos.update[pos_rtr_stop], collapse = ", "),
+            "Gradient boosting for outcome", paste(pos.update[pos_rtr_stop], collapse = ", "), message_focal_text,
             "stop since rtr < 0 or max(correlation) > 1 after", m, "iterations!",
             "Results for this locus are not stable, please check if mismatch between sumstat and LD!",
-            "See details in tutorial website."
+            "See details in tutorial website https://statfungen.github.io/colocboost/articles/."
           ))
         }
 
         # - stop by others - need to check iterations
-        stop <- rep(NA, cb_model_para$L)
+        stop <- stop_no_coverage <- rep(NA, cb_model_para$L)
         for (i in pos.update) {
           if (i %in% pos.update[pos_rtr_stop]) {
-            stop[i] <- TRUE
+            stop[i] <- stop_no_coverage[i] <- TRUE
+            cb_model_para$coveraged_outcome[i] <- FALSE
           } else {
+            stop_no_coverage[i] <- FALSE
             M_i <- length(cb_model[[i]]$profile_loglike_each)
             stop1 <- abs(cb_model[[i]]$profile_loglike_each[M_i] - cb_model[[i]]$profile_loglike_each[M_i - 1]) /
               cb_model[[i]]$profile_loglike_each[M_i - 1] < cb_model[[i]]$stop_thresh
@@ -162,7 +167,15 @@ colocboost_workhorse <- function(cb_data,
             stop3 <- (M_i > M_single_outcome)
             # -- to ensure if some outcomes do not update previously
             if (length(cb_model[[i]]$profile_loglike_each) >= 2) {
-              stop[i] <- (stop1 | stop2 | stop3) # (stop2 | stop3) # (stop1 | stop2 | stop3)
+              stop[i] <- (stop1 | stop2 | stop3)
+              if (stop3){
+                stop_no_coverage[i] <- TRUE
+                cb_model_para$coveraged_outcome[i] <- FALSE
+                warning(paste("ColocBoost gradient boosting for outcome", i, "did not coverage in",
+                              M_single_outcome, "iterations! Please check consistency between summary statistics",
+                              "and LD matrix. See details in tutorial website https://statfungen.github.io/colocboost/articles/."))
+                
+              }
             } else {
               stop[i] <- FALSE
             }
@@ -172,7 +185,7 @@ colocboost_workhorse <- function(cb_data,
         if (all(length(stop) == 1 & stop)) {
           cb_model_para$update_y <- 0
           if (cb_model_para$L == 1) {
-            message(paste("Gradient boosting for outcome 1 converge after", m, "iterations!"))
+            if (!stop_no_coverage) message(paste("Gradient boosting for outcome 1 converged after", m, "iterations!"))
           }
         } else {
           if (all(!stop[pos.update])) {
@@ -180,7 +193,7 @@ colocboost_workhorse <- function(cb_data,
             cb_model_para$update_y <- cb_model_para$update_y
           } else {
             pos_stop <- which(stop) # which outcome reach the stop criterion
-            ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop,
+            ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, stop_no_coverage,
               multi_test_max = multi_test_max
             )
             cb_model_para <- ttmp$cb_model_para
@@ -190,29 +203,26 @@ colocboost_workhorse <- function(cb_data,
               ####### ---------------------------------------------
               # calculate objective function of Y for the last iteration.
               cb_model <- boost_obj_last(cb_data, cb_model, cb_model_para)
+              tt_stop <- setdiff(cb_model_para$true_stop, cb_model_para$no_coverage_stop)
               if (!is.null(focal_outcome_idx)) {
-                if (focal_outcome_idx %in% cb_model_para$true_stop) {
+                if (focal_outcome_idx %in% tt_stop) {
                   message(paste(
                     "Gradient boosting for focal outcome", focal_outcome_idx,
                     "converged after", m, "iterations!"
                   ))
-                  if (length(setdiff(cb_model_para$true_stop, focal_outcome_idx)) != 0) {
+                  if (length(setdiff(tt_stop, focal_outcome_idx)) != 0) {
                     message(paste(
-                      "Gradient boosting for outcome", paste(setdiff(cb_model_para$true_stop, focal_outcome_idx), collapse = ", "),
+                      "Gradient boosting for outcome", paste(setdiff(tt_stop, focal_outcome_idx), collapse = ", "),
                       "converged after", m, "iterations!"
                     ))
                   }
                 } else {
-                  message(paste(
-                    "Gradient boosting for outcome", paste(cb_model_para$true_stop, collapse = ", "),
-                    "converged after", m, "iterations!"
-                  ))
+                  if (length(tt_stop) != 0)
+                    message(paste("Gradient boosting for outcome", paste(tt_stop, collapse = ", "), "converged after", m, "iterations!"))
                 }
               } else {
-                message(paste(
-                  "Gradient boosting for outcome", paste(cb_model_para$true_stop, collapse = ", "),
-                  "converged after", m, "iterations!"
-                ))
+                if (length(tt_stop) != 0)
+                  message(paste("Gradient boosting for outcome", paste(tt_stop, collapse = ", "), "converged after", m, "iterations!"))
               }
             }
           }
@@ -284,5 +294,8 @@ cb_model_para_update <- function(cb_model, cb_model_para) {
   # - additional calculating profile_loglikelihood
   tmp <- sum(sapply(1:length(cb_model), function(i) tail(cb_model[[i]]$profile_loglike_each, n = 1)))
   cb_model_para$profile_loglike <- c(cb_model_para$profile_loglike, tmp)
+  num_updates_outcomes <- sapply(1:length(cb_model), function(i) length(cb_model[[i]]$profile_loglike_each))
+  names(num_updates_outcomes) <- names(cb_model)
+  cb_model_para$num_updates_outcome <- num_updates_outcomes
   return(cb_model_para)
 }
diff --git a/man/colocboost.Rd b/man/colocboost.Rd
index 84811ab..649437a 100644
--- a/man/colocboost.Rd
+++ b/man/colocboost.Rd
@@ -23,7 +23,7 @@ colocboost(
   effect_est = NULL,
   effect_se = NULL,
   effect_n = NULL,
-  M = NULL,
+  M = 500,
   stop_thresh = 1e-06,
   tau = 0.01,
   learning_rate_init = 0.01,
@@ -105,7 +105,7 @@ The innovation: do not provide the same matrix in \code{LD} to reduce the comput
 
 \item{effect_n}{A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommended!}
 
-\item{M}{The maximum number of gradient boosting rounds. If the number of outcomes are large, it will be automatically increased to a larger number.}
+\item{M}{The maximum number of gradient boosting rounds for each outcome (default is 500).}
 
 \item{stop_thresh}{The stop criterion for overall profile loglikelihood function.}
 
diff --git a/man/get_hierarchical_clusters.Rd b/man/get_hierarchical_clusters.Rd
index 24f8a22..cd69fa9 100644
--- a/man/get_hierarchical_clusters.Rd
+++ b/man/get_hierarchical_clusters.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/colocboost_inference.R
 \name{get_hierarchical_clusters}
 \alias{get_hierarchical_clusters}
-\title{Perform modularity-based hierarchical clustering}
+\title{Perform modularity-based hierarchical clustering for a correlation matrix}
 \usage{
 get_hierarchical_clusters(cormat, min_cluster_corr = 0.8)
 }
@@ -17,7 +17,7 @@ A list containing:
 \item{Q_modularity}{The modularity values for the identified clusters.}
 }
 \description{
-This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix (LD matrix).
+This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix.
 }
 \examples{
 # Example usage
diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R
index 396d610..7c10cdf 100644
--- a/tests/testthat/test_plot.R
+++ b/tests/testthat/test_plot.R
@@ -133,8 +133,8 @@ test_that("colocboost_plot handles filtering options", {
   # Test with plot_focal_only
   expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_only = TRUE)), NA)
   
-  # Test with plot_focal_cos_outocme_only
-  expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_cos_outocme_only = TRUE)), NA)
+  # Test with plot_focal_cos_outcome_only
+  expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_cos_outcome_only = TRUE)), NA)
 })
 
 # Test colocboost_plot with visual customization options
diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
index 4d42e14..4fb26e3 100644
--- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
+++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
@@ -1,8 +1,8 @@
 ---
-title: "ColocBoost Wrapper Pipeline"
+title: "Bioinformatics Pipeline for ColocBoost"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{ColocBoost Wrapper Pipeline}
+  %\VignetteIndexEntry{Bioinformatics Pipeline for ColocBoost}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---
@@ -15,7 +15,7 @@ knitr::opts_chunk$set(
 ```
 
 
-This vignette demonstrates how to use the ColocBoost wrapper pipeline to perform colocalization analysis with `colocboost`. 
+This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost to perform colocalization analysis with `colocboost`. 
 
 - See more details about functions in the package `pecotmr` with [link](https://github.com/StatFunGen/pecotmr/tree/main) and 
 `colocboost_pipeline` with [link](https://github.com/StatFunGen/pecotmr/blob/main/R/colocboost_pipeline.R). 
@@ -123,7 +123,7 @@ In this section, we perform the colocalization analysis using the `colocboost_an
 - **`region_data`**: The output of the `load_multitask_regional_data` function.
 - **`focal_trait`**: Name of the trait if performing disease-prioritized ColocBoost.
 - **`event_filters`**: A list of patterns for filtering events based on context names. 
-Example: for sQTL, list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:").
+Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:")`.
 - **`maf_cutoff`**: A scalar to remove variants with maf < maf_cutoff, default is 0.005.
 - **`pip_cutoff_to_skip_ind`**: A vector of cutoff values for skipping analysis based on PIP values for each context. Default is 0.
 - **`pip_cutoff_to_skip_sumstat`**: A vector of cutoff values for skipping analysis based on PIP values for each sumstat. Default is 0.
diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd
index 900a9e9..7824dd2 100644
--- a/vignettes/Summary_Statistics_Colocalization.Rmd
+++ b/vignettes/Summary_Statistics_Colocalization.Rmd
@@ -1,13 +1,13 @@
 ---
-title: "Summary Statistics Colocalization"
+title: "Summary Statistics Data Colocalization"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{Summary Statistics Colocalization}
+  %\VignetteIndexEntry{Summary Statistics Data Colocalization}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---
 
-```{r, include = FALSE}
+```{r, include = FALSE}s
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>"

From c6fe2bf15481c8220d56a6c09265f8cd20511a73 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Mon, 21 Apr 2025 13:27:55 -0400
Subject: [PATCH 2/2] Update colocboost_workhorse.R

---
 R/colocboost_workhorse.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R
index 9b2bcbc..7167ee4 100644
--- a/R/colocboost_workhorse.R
+++ b/R/colocboost_workhorse.R
@@ -135,7 +135,9 @@ colocboost_workhorse <- function(cb_data,
         pos_rtr_stop <- which(sum_cor == 0)
         if (length(pos_rtr_stop) != 0) {
           cb_model_para$update_y[pos.update[pos_rtr_stop]] <- 0
-          message_focal_text <- if (focal_outcome_idx %in% pos.update[pos_rtr_stop]) "including focal outcome" else NULL
+          if (!is.null(focal_outcome_idx)){
+            message_focal_text <- if (focal_outcome_idx %in% pos.update[pos_rtr_stop]) "including focal outcome" else NULL
+          } else {message_focal_text <- NULL}
           message(paste(
             "Gradient boosting for outcome", paste(pos.update[pos_rtr_stop], collapse = ", "), message_focal_text,
             "stop since rtr < 0 or max(correlation) > 1 after", m, "iterations!",