diff --git a/DESCRIPTION b/DESCRIPTION index a5b1bf1..72ab83e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,7 +7,6 @@ Authors@R: c( person(given = "Xuewei", family = "Cao", email = "xc2270@cumc.columbia.edu", role = c("cre", "aut")), person(given = "Haochen", family = "Sun", email = "hs3393@cumc.columbia.edu", role = "aut"), person(given = "Ru", family = "Feng", email = "rf2872@cumc.columbia.edu", role = "aut"), - person(given = "Rahul", family = "Mazumder", email = "rahulmaz@mit.edu", role = "aut"), person(given = "Daniel", family = "Nachun", role = "aut"), person(given = "Kushal", family = "Dey", email = "deyk@mskcc.org", role = c("aut")), person(given = "Gao", family = "Wang", email = "gw2411@cumc.columbia.edu", role = c("aut")) diff --git a/LICENSE b/LICENSE index 412e5b6..48e8b25 100644 --- a/LICENSE +++ b/LICENSE @@ -1,2 +1,2 @@ YEAR: 2025 -COPYRIGHT HOLDER: StatFunGen authors +COPYRIGHT HOLDER: StatFunGen Lab at Columbia University diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index e3067e5..4fc64e3 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -242,6 +242,7 @@ colocboost_assemble <- function(cb_obj, if (!is.null(cb_output$ucos_details$ucos)) { cb_output$vpa <- apply(do.call(cbind, cb_output$ucos_details$ucos_weight), 1, function(w0) 1 - prod(1 - w0)) names(cb_output$vpa) <- data_info$variables + class(cb_output) <- "colocboost" cb_output$ucos_summary <- get_ucos_summary(cb_output) } else { tmp <- list("vpa" = NULL, "ucos_summary" = NULL) diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index c8335fa..fd893c6 100644 --- a/R/colocboost_plot.R +++ b/R/colocboost_plot.R @@ -244,12 +244,12 @@ colocboost_plot <- function(cb_output, y = "log10p", x0 <- intersect(args$x, cs) y1 <- args$y[match(x0, args$x)] points(x0, y1, - pch = 4, col = adjustcolor(legend_text$col[i.uncoloc], alpha.f = 0.3), + pch = 4, col = adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.3), cex = 1.5, lwd = 1.5 ) texts <- c(texts, uncoloc$cos_uncoloc_texts[i.cs]) - shape_col <- c(shape_col, adjustcolor(legend_text$col[i.uncoloc], alpha.f = 1)) - texts_col <- c(texts_col, adjustcolor(legend_text$col[i.uncoloc], alpha.f = 0.8)) + shape_col <- c(shape_col, adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 1)) + texts_col <- c(texts_col, adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.8)) } } if (length(texts) == 0) { diff --git a/_pkgdown.yml b/_pkgdown.yml index b4618f4..eccb370 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -5,8 +5,6 @@ template: navbar: left: - - text: "Home" - href: index.html - text: "News" href: articles/announcements.html - text: "Installation" @@ -24,16 +22,16 @@ articles: contents: - Input_Data_Format - Individual_Level_Colocalization - - Summary_Level_Colocalization + - Summary_Statistics_Colocalization - Disease_Prioritized_Colocalization - - ColocBoost_tutorial_advance - - ColocBoost_tutorial_strong_colocalization - - ColocBoost_tutorial_diagnostic + - Interpret_ColocBoost_Output + - Visualization_ColocBoost_Output - title: internal contents: - announcements - installation + - ColocBoost_tutorial_diagnostic reference: - title: "Example Data" diff --git a/man/figures/missing_representation.png b/man/figures/missing_representation.png new file mode 100644 index 0000000..ad21a5c Binary files /dev/null and b/man/figures/missing_representation.png differ diff --git a/vignettes/ColocBoost_tutorial_advance.Rmd b/vignettes/ColocBoost_tutorial_advance.Rmd deleted file mode 100644 index 731c477..0000000 --- a/vignettes/ColocBoost_tutorial_advance.Rmd +++ /dev/null @@ -1,74 +0,0 @@ ---- -title: "ColocBoost Tutorial (Advanced Usage)" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{ColocBoost Tutortial (Advanced Usage)} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -```{r setup} -library(colocboost) -``` - -## Leveraging a single genotype or LD matrix across multiple phenotypes or summary statistics - -In large-scale genetic studies, particularly those involving colocalization analysis of gene expression across different tissues or cell types (such as GTEx or ROSMAP), it is common to encounter scenarios where a single genotype matrix applies to multiple phenotypes. Similarly, for summary statistics analyses involving multiple complex diseases (like those from UK Biobank), researchers often use a single LD matrix across various datasets. Traditional methods/usage that require duplicating these matrices for each phenotype or summary statistics not only increase computational load but also significantly waste memory resources. - -To address these inefficiencies, ColocBoost offers a streamlined option that supports using a single genotype matrix alongside a matrix of phenotypes or a single LD matrix with a list of summary statistics dataframes. This approach leverages the consistency across genetic structures to reduce redundancy and enhance computational efficiency. - - -**Example of using individual-level data** - -Consider a scenario where all phenotypes share the same genotype matrix, as demonstrated with the `Ind_5traits` dataset. Instead of loading multiple copies of the genotype matrix, we use the first genotype matrix for all phenotypes and covert list of phenotype to be a single matrix. - -```{r oneGeno} -data("Ind_5traits") -X <- Ind_5traits$X[[1]] -Y <- do.call(cbind, Ind_5traits$Y) -# res <- colocboost(X = X, Y = Y) -# res$cos_summary -``` - -**Example of using summary statistics** - -For summary statistics, similar optimization is applied using `Sumstat_5traits` dataset where all statistical summaries share the same LD matrix. - -```{r oneLD} -data("Sumstat_5traits") -sumstat <- Sumstat_5traits$sumstat -LD <- get_cormat(Ind_5traits$X[[1]]) -# res <- colocboost(sumstat = sumstat, LD = LD) -# res$cos_summary -``` - -**Example of combining individual-level data and summary statistics** - -Consider a scenario where several individual-level phenotypes share a common genotype matrix, while multiple summary statistics share the same LD matrix. Our goal is to identify the colocalization across this mixture of individual-level phenotypes and multiple summary statistics. - - -```{r oneMixture} -data("Ind_5traits") -X <- Ind_5traits$X[[1]] -Y <- do.call(cbind, Ind_5traits$Y[1:3]) -data("Sumstat_5traits") -sumstat <- Sumstat_5traits$sumstat[4:5] -# LD <- get_cormat(Ind_5traits$X[[1]]) -# res <- colocboost(X = X, Y = Y, sumstat = sumstat, LD = LD) -# res$cos_summary -``` - - -## Few genotype matrices or few LD matrices than phenotypes and summary statistics (dict_YX and dict_sumstat) - -## Different phenotypes has different SNPs. - - - diff --git a/vignettes/ColocBoost_tutorial_strong_colocalization.Rmd b/vignettes/ColocBoost_tutorial_strong_colocalization.Rmd deleted file mode 100644 index d34d360..0000000 --- a/vignettes/ColocBoost_tutorial_strong_colocalization.Rmd +++ /dev/null @@ -1,35 +0,0 @@ ---- -title: "ColocBoost Tutortial (Summary of ColocBoost results)" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{ColocBoost Tutortial (Summary of ColocBoost results)} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -ColocBoost: Multi-omics xQTL colocalization improves the discovery of causal variants for complex diseases. - - - -```{r setup} -library(colocboost) -``` - - -## Get summary table (with or without target phenotype) - -## Plot ColocBoost results - -## Check others .....??? - - - - - diff --git a/vignettes/Input_Data_Format.Rmd b/vignettes/Input_Data_Format.Rmd index f1ec390..a472cac 100644 --- a/vignettes/Input_Data_Format.Rmd +++ b/vignettes/Input_Data_Format.Rmd @@ -18,44 +18,51 @@ knitr::opts_chunk$set( library(colocboost) ``` -## Input Data Format -This vignette documents the required input data formats and provides examples of data included in the package. +This vignette documents the standard input data formats of `colocboost`. -### Individual Level Data +## 1. Individual Level Data -For analyses using individual-level data, the package requires matched X and Y data. Below is the format and an example from the package: +For analyses using individual-level data, the basic format for single trait is as follows: -```{r individual-level-example} -# Load example individual-level data -data(Ind_5traits) +- `X` is an N * P matrix with N individuals and P variants. Including variant names as column names is highly recommended, especially when working with multiple X matrices and Y vectors. +- `Y` is a length N vector containing phenotype values for the same N individuals with X. -# Display the structure -str(Ind_5traits) -``` +The input format for multiple traits is similar, but `X` matrix should be a list of matrices, each corresponding to a different trait. `Y` vector should also be a list of vectors. +For example: + +- `X = list(X1, X2, X3, X4, X5)` where each `Xi` is a matrix for trait `i` - with the dimension of Ni * Pi, where Ni and Pi do not need to be the same for different traits. +- `Y = list(Y1, Y2, Y3, Y4, Y5)` where each `Yi` is a vector for trait `i` - with Ni individuals. + + +`colocboost` also offers flexible input options (see detailed usage with different input formats, +refer to [Individual Data Colocalization](https://statfungen.github.io/colocboost/articles/Individual_Level_Colocalization.html).): -#### Format Requirements +- Single X matrix with N * P with Y matrix with N * L for L traits. +- Multiple X matrices and unmatched Y vectors with a mapping dictionary. -- Data should be in a data frame or matrix -- Each row represents an individual -- Columns must include matched genotype (X) and phenotype (Y) data -- Missing values should be coded as NA -### Summary Statistics +## 2. Summary Statistics -For analyses using summary statistics, the package requires a data frame with matched linkage disequilibrium (LD) information: +For analyses using summary statistics, the basic format for single trait is as follows: +- `sumstat` is a data frame with required columns `z` or (`beta`, `sebeta`), and optional columns but highly recommended `n` and `variant`. ```{r summary-stats-example} -# Load example summary statistics data data(Sumstat_5traits) - -# Display the structure -str(Sumstat_5traits) +head(Sumstat_5traits$sumstat[[1]]) ``` -#### Format Requirements + - `z` or (`beta`, `sebeta`) - required: either z-score or (effect size and standard error) + - `n` - highly recommended: sample size for the summary statistics, it is highly recommendation to provide. + - `variant` - highly recommended: required if sumstat for different outcomes do not have the same number of variables (multiple sumstat and multiple LD). + + +- `LD` is a matrix of LD. This matrix does not need to contain the exact same variants as in `sumstat`, but the `colnames` and `rownames` of `LD` should include the `variant` names for proper alignment. + +The input format for multiple traits is similar, but `sumstat` should be a list of data frames `sumstat = list(sumstat1, sumstat2, sumstat3)`. +The flexibility of input format for multiple traits is as follows (see detailed usage with different input formats, +refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html).):: -- Data should be in a data frame -- Each row represents a variant -- Must include effect size, standard error, and sample size information -- LD matrix must be provided separately or calculated from the data +- One LD matrix with a superset of variants in `sumstat` for all traits is allowed. +- Multiple LD matrices, each corresponding to a different trait, are also allowed for the trait-specific LD structure. +- Multiple LD matrices and unmatched `sumstat` data frames with a mapping dictionary are also allowed. diff --git a/vignettes/Interpret_ColocBoost_Output.Rmd b/vignettes/Interpret_ColocBoost_Output.Rmd new file mode 100644 index 0000000..ff48277 --- /dev/null +++ b/vignettes/Interpret_ColocBoost_Output.Rmd @@ -0,0 +1,101 @@ +--- +title: "Interpret ColocBoost Output" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Interpret ColocBoost Output} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + + +This vignette demonstrates how to interpret the output of ColocBoost, specifically to get the summary of colocalization and filtering our the weak signals. + +```{r setup} +library(colocboost) +``` + +## 1. Look at summary of colocalization + +### Causal variant structure +The dataset features two causal variants with indices 644 and 2289. + +- Causal variant 644 is associated with traits 1, 2, 3, and 4. +- Causal variant 2289 is associated with traits 2, 3, and 5. + +```{r run-colocboost} +# Loading the Dataset +data(Ind_5traits) +# Run colocboost +res <- colocboost(X = Ind_5traits$X, Y = Ind_5traits$Y) +cos_summary <- res$cos_summary +names(cos_summary) +``` + +The `cos_summary` object contains the colocalization summary for all colocalization events, with each row representing a single colocalization event. +The summary includes the following columns: + + +- **focal_outcome**: The focal outcome being analyzed, or `FALSE` if no focal outcome exists. +- **colocalized_outcomes**: Traits that are colocalized within the 95% colocalization confidence set (CoS). +- **cos_id**: A unique identifier for each 95% colocalization confidence set (CoS). +- **purity**: The minimum absolute correlation of variants within the 95% colocalization confidence set (CoS). +- **top_variable**: The variable with the highest variant colocalization probability (VCP). +- **top_variable_vcp**: The variant colocalization probability for the top variable. +- **cos_npc**: The normalized probability of colocalization for the 95% confidence set, providing empirical evidence in favor of colocalization over a trait-specific configuration. +- **min_npc_outcome**: The minimum normalized probability among colocalized traits. +- **n_variables**: The number of variables in the 95% colocalization confidence set (CoS). +- **colocalized_index**: The indices of colocalized variables. +- **colocalized_variables**: A list of colocalized variables. +- **colocalized_variables_vcp**: The variant colocalization probabilities for all colocalized variables. + + +To obtain the summary of colocalization with a specific focus on traits of interest, +you can use the `get_cos_summary`, see detailed usage of this function in [link](https://statfungen.github.io/colocboost/reference/get_cos_summary.html). +This function allows you to filter the colocalization summary based on a particular outcome of interest, +making it easier to interpret the results for specific traits. +For example, if you are interested in the colocalization events involving the traits `Y1` and `Y2`, you can use the following code: + + +```{r summary-colocboost} +# Get summary table of colocalization +cos_interest_outcome <- get_cos_summary(res, interest_outcome = c("Y1", "Y2")) +``` + + + +## 2. Get strong colocalization signals + +In `cos_summary`, for each 95% CoS, the `cos_npc` column provides a normalized probability of colocalization and +`min_npc_outcome` column provides the minimum normalized probability among colocalized traits. +Those two metrices are measured as an empirical evidence of colocalization both in CoS-level and in trait-level. +To obtain the best minimal colocalization configuration can be defined by using both `cos_npc` and `npc_outcome` +See detailed usage of this function in [link](https://statfungen.github.io/colocboost/reference/get_strong_colocalization.html). + + +```{r run-strong-colocalization} +filter_res <- get_strong_colocalization(res, cos_npc_cutoff = 0.5, npc_outcome_cutoff = 0.2) +``` + +The output from `get_strong_colocalization` is the same as output from `colocboost`, which can be directly used in any post inference and visualization. + + +## 3. Details of ColocBoost output + +The entire colocalization output from `colocboost` is stored in the `colocboost` object, which contains several components: + +- `cos_summary`: A summary table for colocalization events. +- `vcp`: The variable colocalized probability for each variable. +- `cos_details`: A object with all information for colocalization results. +- `data_info`: A object with detailed information from input data. +- `model_info`: A object with detailed information for colocboost model. + + + + diff --git a/vignettes/Summary_Level_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd similarity index 98% rename from vignettes/Summary_Level_Colocalization.Rmd rename to vignettes/Summary_Statistics_Colocalization.Rmd index 59e5769..396535a 100644 --- a/vignettes/Summary_Level_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -1,8 +1,8 @@ --- -title: "Summary Level Data Colocalization" +title: "Summary Statistics Colocalization" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Summary Level Data Colocalization} + %\VignetteIndexEntry{Summary Statistics Colocalization} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/Visualization_ColocBoost_Output.Rmd b/vignettes/Visualization_ColocBoost_Output.Rmd new file mode 100644 index 0000000..1abeced --- /dev/null +++ b/vignettes/Visualization_ColocBoost_Output.Rmd @@ -0,0 +1,102 @@ +--- +title: "Visualization of ColocBoost Results" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Visualization of ColocBoost Results} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +This vignette demonstrates how to visualize and interpret the output of ColocBoost results. + +```{r setup} +library(colocboost) +``` + +**Causal variant structure** + +The dataset features two causal variants with indices 644 and 2289. + +- Causal variant 644 is associated with traits 1, 2, 3, and 4. +- Causal variant 2289 is associated with traits 2, 3, and 5. + +```{r run-colocboost} +# Loading the Dataset +data(Ind_5traits) +# Run colocboost +res <- colocboost(X = Ind_5traits$X, Y = Ind_5traits$Y) +``` + +## 1. Default plot and interpretation + +The default plot of the colocboost results provides a visual representation of the colocalization events. + +- The x-axis indicates the indices of variants and y-axis indicates the -log10(p-value) from marginal associations. +- The colors of the points represent the colocalization events, with different colors indicating different colocalization events. + +```{r basic-plot} +colocboost_plot(res) +``` + +**Parameters to adjust plot** + +- `plot_cols = 2` (default) indicates the number of columns in the plot. +- `y = "log10p"` (default) with optional + - `y = "z_original"` for z-scores + - `y = "vcp"` for variant colocalization probabilities, + - `y = "coef"` for regression coefficients estimated from the colocboost model. +- `plot_cos_idx = NULL` (default) indicates all colocalization events are plotted. `plot_cos_idx = 1` can be specified to plot the 1st colocalization event, and so on. +- `outcome_idx = NULL` (default) indicates only the traits with colocalization are plotted. `outcome_idx = c(1,2,5)` can be specified to plot the traits 1, 2, and 5. +- `cos_color = NULL` (default) indicates the colors of the colocalization events. Specify a vector of colors to customize the plot. + + +## 2. Advanced options + +There are several advanced options available for customizing the plot by deepening the visualization of the colocboost results. + +### 2.1. Plot with a zoom-in region + +You can specify a zoom-in region by providing a `grange` argument, which is a vector of length 2 indicating the start and end indices of the region to be zoomed in. + +```{r zoomin-plot} +colocboost_plot(res, grange = c(1:1000)) +``` + + +### 2.2. Plot with marked top variants + +You can highlight the top variants in the plot by setting `show_top_variables = TRUE`. This will add a red circle to top variants with highest VCP for each CoS. + +```{r top-plot} +colocboost_plot(res, show_top_variables = TRUE) +``` + + +### 2.3. Plot CoS variants to uncolocalized traits to diagnostic the colocalization. + +There are three options available for plotting the CoS variants to uncolocalized traits: +- `show_cos_to_uncoloc = FALSE` (default), if `TRUE` will plot all CoS variants to all uncolocalized traits. +- `show_cos_to_uncoloc_idx = NULL` (default), if specified, will plot the specified CoS variants to all uncolocalized traits. +- `show_cos_to_uncoloc_outcome = NULL` (default), if specified, will plot the all CoS variants to the specified uncolocalized traits. + +```{r ucos-plot} +colocboost_plot(res, show_cos_to_uncoloc = TRUE) +``` + +### 2.4. Plot with an added vertical line + +You can add a vertical line to the plot by setting `add_vertical = TRUE` and `add_vertical_idx = **`. This will add a vertical line at the specified index. +For example, to add a vertical line at true causal variants, you can set `add_vertical_idx = unique(unlist(Ind_5traits$true_effect_variants))`. +Following plot also shows the top variants. + + +```{r vertical-plot} +colocboost_plot(res, show_top_variables = TRUE, add_vertical = TRUE, add_vertical_idx = unique(unlist(Ind_5traits$true_effect_variants))) +```