From 311cb3c8ab6a430bc5e9f84635a468901a9b3d22 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Mon, 21 Apr 2025 13:24:56 -0400 Subject: [PATCH 1/2] minor fix 1. change to per-outcome maximum iterations 2. minor fix on documentation 3. minor fix on website --- R/colocboost.R | 4 +- R/colocboost_inference.R | 6 +- R/colocboost_init.R | 9 ++- R/colocboost_output.R | 7 ++- R/colocboost_update.R | 3 +- R/colocboost_workhorse.R | 59 +++++++++++-------- man/colocboost.Rd | 4 +- man/get_hierarchical_clusters.Rd | 4 +- tests/testthat/test_plot.R | 4 +- vignettes/ColocBoost_Wrapper_Pipeline.Rmd | 8 +-- .../Summary_Statistics_Colocalization.Rmd | 6 +- 11 files changed, 70 insertions(+), 44 deletions(-) diff --git a/R/colocboost.R b/R/colocboost.R index 82bff67..3c015e7 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -39,7 +39,7 @@ #' @param effect_se Matrix of standard errors associated with the beta values #' @param effect_n A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommended! #' -#' @param M The maximum number of gradient boosting rounds. If the number of outcomes are large, it will be automatically increased to a larger number. +#' @param M The maximum number of gradient boosting rounds for each outcome (default is 500). #' @param stop_thresh The stop criterion for overall profile loglikelihood function. #' @param tau The smooth parameter for proximity adaptive smoothing weights for the best update jk-star. #' @param learning_rate_init The minimum learning rate for updating in each iteration. @@ -140,7 +140,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data effect_se = NULL, # same as HyPrColoc, sebeta hat matrix with rowname of variable names effect_n = NULL, ###### - Model Parameters - M = NULL, # maximum iteration time + M = 500, # maximum iteration for each outcome stop_thresh = 1e-06, # stop criterion for profile_log and objective functions tau = 0.01, # kernal_tau parameter for proximity smoothing weight learning_rate_init = 0.01, # minimum step size for updating in each boosting round diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index 511a258..0648bd8 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -60,9 +60,9 @@ get_cormat <- function(X, intercepte = TRUE) { return(cr) } -#' @title Perform modularity-based hierarchical clustering +#' @title Perform modularity-based hierarchical clustering for a correlation matrix #' @description -#' This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix (LD matrix). +#' This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix. #' #' @param cormat A correlation matrix. #' @param min_cluster_corr The small correlation for the weights distributions across different iterations to be decided having only one cluster. Default is 0.8. @@ -495,7 +495,7 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { if (delta <= 0) { warning(paste( "Warning message: potential sumstat & LD mismatch may happens for outcome", outcome_idx, - ". Using logLR = CoS(profile) - max(profile). Please check our website." + ". Using logLR = CoS(profile) - max(profile). Please check our website https://statfungen.github.io/colocboost/articles/." )) } cos_profile diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 1fe75c4..767e7bf 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -263,6 +263,11 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01, } # - initial coloc_thresh coloc_thresh <- (1 - coloc_thresh) * max(sapply(1:length(cb_model), function(i) max(cb_model[[i]]$change_loglike))) + # - initial num_updates_outcome and coveraged outcome + num_updates_outcome <- lapply(1:L, function(i) 1) + coveraged_outcome <- lapply(1:L, function(i) TRUE) + names(num_updates_outcome) <- names(coveraged_outcome) <- names(cb_model) + cb_model_para <- list( "L" = L, "P" = P, @@ -289,7 +294,9 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01, "variables" = cb_data$variable.names, "focal_outcome_idx" = focal_outcome_idx, "coveraged" = TRUE, - "num_updates" = 1 + "num_updates" = 1, + "coveraged_outcome" = coveraged_outcome, + "num_updates_outcome" = num_updates_outcome ) class(cb_model_para) <- "colocboost" diff --git a/R/colocboost_output.R b/R/colocboost_output.R index b347b4a..4267891 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -977,7 +977,9 @@ get_model_info <- function(cb_obj, outcome_names = NULL) { profile_loglik <- cb_obj$cb_model_para$profile_loglike n_updates <- cb_obj$cb_model_para$num_updates + n_updates_outcome <- cb_obj$cb_model_para$num_updates_outcome model_coveraged <- cb_obj$cb_model_para$coveraged + model_coveraged_outcome <- cb_obj$cb_model_para$coveraged_outcome jk_update <- cb_obj$cb_model_para$real_update_jk if (!is.null(jk_update)){ rownames(jk_update) <- paste0("jk_star_", 1:nrow(jk_update)) @@ -987,7 +989,8 @@ get_model_info <- function(cb_obj, outcome_names = NULL) { outcome_coupled_best_update_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_single) outcome_profile_loglik <- lapply(cb_obj$cb_model, function(cb) cb$profile_loglike_each) names(outcome_proximity_obj) <- names(outcome_coupled_best_update_obj) <- - names(outcome_profile_loglik) <- outcome_names + names(outcome_profile_loglik) <- names(n_updates_outcome) <- + names(model_coveraged_outcome) <- outcome_names ll <- list( "model_coveraged" = model_coveraged, "n_updates" = n_updates, @@ -995,6 +998,8 @@ get_model_info <- function(cb_obj, outcome_names = NULL) { "outcome_profile_loglik" = outcome_profile_loglik, "outcome_proximity_obj" = outcome_proximity_obj, "outcome_coupled_best_update_obj" = outcome_coupled_best_update_obj, + "outcome_model_coveraged" = model_coveraged_outcome, + "outcome_n_updates" = n_updates_outcome, "jk_star" = jk_update ) return(ll) diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 546c7df..5ae8f3e 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -175,7 +175,7 @@ boost_KL_delta <- function(z, ld_feature, adj_dep, } -boost_check_stop <- function(cb_model, cb_model_para, pos_stop, +boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage, multi_test_max = 1) { # - check the iteration for the stop outcome (pos_stop has the same jk with original data) iter_each <- sapply(pos_stop, function(i) { @@ -189,6 +189,7 @@ boost_check_stop <- function(cb_model, cb_model_para, pos_stop, # --- stop all pos_stop outcomes cb_model_para$update_y[pos_stop] <- 0 cb_model_para$true_stop <- pos_stop + cb_model_para$no_coverage_stop <- which(stop_no_coverage==TRUE) cb_model_para$need_more <- NULL } else { # keep boosting for outcome with <= 10 iterations diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index 1269c2b..9b2bcbc 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -13,7 +13,7 @@ #' #' @noRd colocboost_workhorse <- function(cb_data, - M = NULL, + M = 500, tau = 0.01, prioritize_jkstar = TRUE, learning_rate_init = 0.01, @@ -70,11 +70,13 @@ colocboost_workhorse <- function(cb_data, coloc_thresh = coloc_thresh ) - M_single_outcome <- 200 - if (is.null(M)) { - M <- cb_model_para$L * M_single_outcome + if (M == 1){ + cb_model_para$M <- 1 + } else { + M_single_outcome <- M + M <- cb_model_para$L * M + cb_model_para$M <- M } - cb_model_para$M <- M # - if multi_test value > multi_test cutoff for some outcomes, we will not update them. if (!is.null(cb_model_para$true_stop)) { if (sum(cb_model_para$update_y == 1) == 0) { @@ -133,20 +135,23 @@ colocboost_workhorse <- function(cb_data, pos_rtr_stop <- which(sum_cor == 0) if (length(pos_rtr_stop) != 0) { cb_model_para$update_y[pos.update[pos_rtr_stop]] <- 0 + message_focal_text <- if (focal_outcome_idx %in% pos.update[pos_rtr_stop]) "including focal outcome" else NULL message(paste( - "Gradient boosting for outcome", paste(pos.update[pos_rtr_stop], collapse = ", "), + "Gradient boosting for outcome", paste(pos.update[pos_rtr_stop], collapse = ", "), message_focal_text, "stop since rtr < 0 or max(correlation) > 1 after", m, "iterations!", "Results for this locus are not stable, please check if mismatch between sumstat and LD!", - "See details in tutorial website." + "See details in tutorial website https://statfungen.github.io/colocboost/articles/." )) } # - stop by others - need to check iterations - stop <- rep(NA, cb_model_para$L) + stop <- stop_no_coverage <- rep(NA, cb_model_para$L) for (i in pos.update) { if (i %in% pos.update[pos_rtr_stop]) { - stop[i] <- TRUE + stop[i] <- stop_no_coverage[i] <- TRUE + cb_model_para$coveraged_outcome[i] <- FALSE } else { + stop_no_coverage[i] <- FALSE M_i <- length(cb_model[[i]]$profile_loglike_each) stop1 <- abs(cb_model[[i]]$profile_loglike_each[M_i] - cb_model[[i]]$profile_loglike_each[M_i - 1]) / cb_model[[i]]$profile_loglike_each[M_i - 1] < cb_model[[i]]$stop_thresh @@ -162,7 +167,15 @@ colocboost_workhorse <- function(cb_data, stop3 <- (M_i > M_single_outcome) # -- to ensure if some outcomes do not update previously if (length(cb_model[[i]]$profile_loglike_each) >= 2) { - stop[i] <- (stop1 | stop2 | stop3) # (stop2 | stop3) # (stop1 | stop2 | stop3) + stop[i] <- (stop1 | stop2 | stop3) + if (stop3){ + stop_no_coverage[i] <- TRUE + cb_model_para$coveraged_outcome[i] <- FALSE + warning(paste("ColocBoost gradient boosting for outcome", i, "did not coverage in", + M_single_outcome, "iterations! Please check consistency between summary statistics", + "and LD matrix. See details in tutorial website https://statfungen.github.io/colocboost/articles/.")) + + } } else { stop[i] <- FALSE } @@ -172,7 +185,7 @@ colocboost_workhorse <- function(cb_data, if (all(length(stop) == 1 & stop)) { cb_model_para$update_y <- 0 if (cb_model_para$L == 1) { - message(paste("Gradient boosting for outcome 1 converge after", m, "iterations!")) + if (!stop_no_coverage) message(paste("Gradient boosting for outcome 1 converged after", m, "iterations!")) } } else { if (all(!stop[pos.update])) { @@ -180,7 +193,7 @@ colocboost_workhorse <- function(cb_data, cb_model_para$update_y <- cb_model_para$update_y } else { pos_stop <- which(stop) # which outcome reach the stop criterion - ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, + ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, stop_no_coverage, multi_test_max = multi_test_max ) cb_model_para <- ttmp$cb_model_para @@ -190,29 +203,26 @@ colocboost_workhorse <- function(cb_data, ####### --------------------------------------------- # calculate objective function of Y for the last iteration. cb_model <- boost_obj_last(cb_data, cb_model, cb_model_para) + tt_stop <- setdiff(cb_model_para$true_stop, cb_model_para$no_coverage_stop) if (!is.null(focal_outcome_idx)) { - if (focal_outcome_idx %in% cb_model_para$true_stop) { + if (focal_outcome_idx %in% tt_stop) { message(paste( "Gradient boosting for focal outcome", focal_outcome_idx, "converged after", m, "iterations!" )) - if (length(setdiff(cb_model_para$true_stop, focal_outcome_idx)) != 0) { + if (length(setdiff(tt_stop, focal_outcome_idx)) != 0) { message(paste( - "Gradient boosting for outcome", paste(setdiff(cb_model_para$true_stop, focal_outcome_idx), collapse = ", "), + "Gradient boosting for outcome", paste(setdiff(tt_stop, focal_outcome_idx), collapse = ", "), "converged after", m, "iterations!" )) } } else { - message(paste( - "Gradient boosting for outcome", paste(cb_model_para$true_stop, collapse = ", "), - "converged after", m, "iterations!" - )) + if (length(tt_stop) != 0) + message(paste("Gradient boosting for outcome", paste(tt_stop, collapse = ", "), "converged after", m, "iterations!")) } } else { - message(paste( - "Gradient boosting for outcome", paste(cb_model_para$true_stop, collapse = ", "), - "converged after", m, "iterations!" - )) + if (length(tt_stop) != 0) + message(paste("Gradient boosting for outcome", paste(tt_stop, collapse = ", "), "converged after", m, "iterations!")) } } } @@ -284,5 +294,8 @@ cb_model_para_update <- function(cb_model, cb_model_para) { # - additional calculating profile_loglikelihood tmp <- sum(sapply(1:length(cb_model), function(i) tail(cb_model[[i]]$profile_loglike_each, n = 1))) cb_model_para$profile_loglike <- c(cb_model_para$profile_loglike, tmp) + num_updates_outcomes <- sapply(1:length(cb_model), function(i) length(cb_model[[i]]$profile_loglike_each)) + names(num_updates_outcomes) <- names(cb_model) + cb_model_para$num_updates_outcome <- num_updates_outcomes return(cb_model_para) } diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 84811ab..649437a 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -23,7 +23,7 @@ colocboost( effect_est = NULL, effect_se = NULL, effect_n = NULL, - M = NULL, + M = 500, stop_thresh = 1e-06, tau = 0.01, learning_rate_init = 0.01, @@ -105,7 +105,7 @@ The innovation: do not provide the same matrix in \code{LD} to reduce the comput \item{effect_n}{A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommended!} -\item{M}{The maximum number of gradient boosting rounds. If the number of outcomes are large, it will be automatically increased to a larger number.} +\item{M}{The maximum number of gradient boosting rounds for each outcome (default is 500).} \item{stop_thresh}{The stop criterion for overall profile loglikelihood function.} diff --git a/man/get_hierarchical_clusters.Rd b/man/get_hierarchical_clusters.Rd index 24f8a22..cd69fa9 100644 --- a/man/get_hierarchical_clusters.Rd +++ b/man/get_hierarchical_clusters.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/colocboost_inference.R \name{get_hierarchical_clusters} \alias{get_hierarchical_clusters} -\title{Perform modularity-based hierarchical clustering} +\title{Perform modularity-based hierarchical clustering for a correlation matrix} \usage{ get_hierarchical_clusters(cormat, min_cluster_corr = 0.8) } @@ -17,7 +17,7 @@ A list containing: \item{Q_modularity}{The modularity values for the identified clusters.} } \description{ -This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix (LD matrix). +This function performs a modularity-based hierarchical clustering approach to identify clusters from a correlation matrix. } \examples{ # Example usage diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R index 396d610..7c10cdf 100644 --- a/tests/testthat/test_plot.R +++ b/tests/testthat/test_plot.R @@ -133,8 +133,8 @@ test_that("colocboost_plot handles filtering options", { # Test with plot_focal_only expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_only = TRUE)), NA) - # Test with plot_focal_cos_outocme_only - expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_cos_outocme_only = TRUE)), NA) + # Test with plot_focal_cos_outcome_only + expect_error(suppressWarnings(colocboost_plot(cb_res, plot_focal_cos_outcome_only = TRUE)), NA) }) # Test colocboost_plot with visual customization options diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd index 4d42e14..4fb26e3 100644 --- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd +++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd @@ -1,8 +1,8 @@ --- -title: "ColocBoost Wrapper Pipeline" +title: "Bioinformatics Pipeline for ColocBoost" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{ColocBoost Wrapper Pipeline} + %\VignetteIndexEntry{Bioinformatics Pipeline for ColocBoost} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -15,7 +15,7 @@ knitr::opts_chunk$set( ``` -This vignette demonstrates how to use the ColocBoost wrapper pipeline to perform colocalization analysis with `colocboost`. +This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost to perform colocalization analysis with `colocboost`. - See more details about functions in the package `pecotmr` with [link](https://github.com/StatFunGen/pecotmr/tree/main) and `colocboost_pipeline` with [link](https://github.com/StatFunGen/pecotmr/blob/main/R/colocboost_pipeline.R). @@ -123,7 +123,7 @@ In this section, we perform the colocalization analysis using the `colocboost_an - **`region_data`**: The output of the `load_multitask_regional_data` function. - **`focal_trait`**: Name of the trait if performing disease-prioritized ColocBoost. - **`event_filters`**: A list of patterns for filtering events based on context names. -Example: for sQTL, list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:"). +Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:")`. - **`maf_cutoff`**: A scalar to remove variants with maf < maf_cutoff, default is 0.005. - **`pip_cutoff_to_skip_ind`**: A vector of cutoff values for skipping analysis based on PIP values for each context. Default is 0. - **`pip_cutoff_to_skip_sumstat`**: A vector of cutoff values for skipping analysis based on PIP values for each sumstat. Default is 0. diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 900a9e9..7824dd2 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -1,13 +1,13 @@ --- -title: "Summary Statistics Colocalization" +title: "Summary Statistics Data Colocalization" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Summary Statistics Colocalization} + %\VignetteIndexEntry{Summary Statistics Data Colocalization} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -```{r, include = FALSE} +```{r, include = FALSE}s knitr::opts_chunk$set( collapse = TRUE, comment = "#>" From c6fe2bf15481c8220d56a6c09265f8cd20511a73 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Mon, 21 Apr 2025 13:27:55 -0400 Subject: [PATCH 2/2] Update colocboost_workhorse.R --- R/colocboost_workhorse.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index 9b2bcbc..7167ee4 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -135,7 +135,9 @@ colocboost_workhorse <- function(cb_data, pos_rtr_stop <- which(sum_cor == 0) if (length(pos_rtr_stop) != 0) { cb_model_para$update_y[pos.update[pos_rtr_stop]] <- 0 - message_focal_text <- if (focal_outcome_idx %in% pos.update[pos_rtr_stop]) "including focal outcome" else NULL + if (!is.null(focal_outcome_idx)){ + message_focal_text <- if (focal_outcome_idx %in% pos.update[pos_rtr_stop]) "including focal outcome" else NULL + } else {message_focal_text <- NULL} message(paste( "Gradient boosting for outcome", paste(pos.update[pos_rtr_stop], collapse = ", "), message_focal_text, "stop since rtr < 0 or max(correlation) > 1 after", m, "iterations!",