Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 55 additions & 21 deletions R/colocboost_inference.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,34 +120,67 @@ get_hierarchical_clusters <- function(cormat, min_cluster_corr = 0.8) {
#' @noRd
get_modularity <- function(Weight, B) {
if (dim(Weight)[1] == 1) return(0)

if (nrow(B) != nrow(Weight)) {
stop("B must have one row per variable in Weight")
}
if (any(rowSums(B) == 0)) {
stop("Each row of B must belong to one cluster")
}
index <- max.col(B, ties.method = "first")
get_modularity_partition(get_modularity_components(Weight), index, ncol(B))
}

get_modularity_components <- function(Weight) {
W_pos <- Weight * (Weight > 0)
W_neg <- Weight * (Weight < 0)
N <- dim(Weight)[1]
K_pos <- colSums(W_pos)
K_neg <- colSums(W_neg)
m_pos <- sum(K_pos)
m_neg <- sum(K_neg)
m <- m_pos + m_neg

if (m == 0) return(0)

# cate <- B %*% t(B)
cate <- tcrossprod(B)

if (m_pos == 0 & m_neg == 0) return(0)

if (m_pos == 0) {
list(
W_pos = W_pos,
W_neg = W_neg,
K_pos = K_pos,
K_neg = K_neg,
m_pos = m_pos,
m_neg = m_neg,
m = m_pos + m_neg
)
}

get_modularity_partition <- function(mod_components, index, n_cluster) {
if (nrow(mod_components$W_pos) == 1) return(0)
if (mod_components$m == 0) return(0)

index_factor <- factor(index, levels = seq_len(n_cluster))
block_sum <- function(W) {
row_sums <- rowsum(W, index_factor, reorder = TRUE)
vapply(seq_len(n_cluster), function(i) sum(row_sums[i, index == i]), numeric(1))
}
group_sum <- function(x) {
as.numeric(rowsum(matrix(x, ncol = 1), index_factor, reorder = TRUE))
}

A_pos <- block_sum(mod_components$W_pos)
A_neg <- block_sum(mod_components$W_neg)
D_pos <- group_sum(mod_components$K_pos)
D_neg <- group_sum(mod_components$K_neg)

if (mod_components$m_pos == 0 & mod_components$m_neg == 0) return(0)

if (mod_components$m_pos == 0) {
Q_positive <- 0
Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg
} else if (m_neg == 0) {
Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos
Q_negative <- sum(A_neg - D_neg^2 / mod_components$m_neg) / mod_components$m_neg
} else if (mod_components$m_neg == 0) {
Q_positive <- sum(A_pos - D_pos^2 / mod_components$m_pos) / mod_components$m_pos
Q_negative <- 0
} else {
Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos
Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg
Q_positive <- sum(A_pos - D_pos^2 / mod_components$m_pos) / mod_components$m_pos
Q_negative <- sum(A_neg - D_neg^2 / mod_components$m_neg) / mod_components$m_neg
}
Q <- m_pos / m * Q_positive - m_neg / m * Q_negative

Q <- mod_components$m_pos / mod_components$m * Q_positive -
mod_components$m_neg / mod_components$m * Q_negative
return(Q)
}

Expand All @@ -165,15 +198,16 @@ get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) {
IND <- 1
Q <- 1
} else {
Q <- c()
Q <- numeric(m)
if (ncol(Sigma) < 10) {
m <- ncol(Sigma)
Q <- numeric(m)
}
mod_components <- get_modularity_components(Sigma)
for (i in 1:m)
{
index <- cutree(hc, i)
B <- sapply(1:i, function(t) as.numeric(index == t))
Q[i] <- get_modularity(Sigma, B)
Q[i] <- get_modularity_partition(mod_components, index, i)
}

IND <- which(Q == max(Q))
Expand Down
12 changes: 12 additions & 0 deletions tests/testthat/helper-plot-device.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
.colocboost_plot_device_env <- new.env(parent = emptyenv())
.colocboost_plot_device_env$file <- tempfile("colocboost-test-plots-", fileext = ".pdf")
grDevices::pdf(.colocboost_plot_device_env$file)

reg.finalizer(.colocboost_plot_device_env, function(e) {
while (grDevices::dev.cur() > 1) {
grDevices::dev.off()
}
unlink(e$file)
unlink("Rplots.pdf")
unlink(file.path("tests", "testthat", "Rplots.pdf"))
}, onexit = TRUE)
72 changes: 71 additions & 1 deletion tests/testthat/test_inference.R
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,76 @@ test_that("get_hierarchical_clusters functions correctly", {
expect_equal(result_all_high$n_cluster, 1)
})

test_that("block-sum modularity path matches reference modularity", {
set.seed(20260525)

get_modularity_reference <- function(Weight, B) {
if (dim(Weight)[1] == 1) return(0)
W_pos <- Weight * (Weight > 0)
W_neg <- Weight * (Weight < 0)
K_pos <- colSums(W_pos)
K_neg <- colSums(W_neg)
m_pos <- sum(K_pos)
m_neg <- sum(K_neg)
m <- m_pos + m_neg
if (m == 0) return(0)
cate <- tcrossprod(B)
if (m_pos == 0 & m_neg == 0) return(0)
if (m_pos == 0) {
Q_positive <- 0
Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg
} else if (m_neg == 0) {
Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos
Q_negative <- 0
} else {
Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos
Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg
}
m_pos / m * Q_positive - m_neg / m * Q_negative
}

get_n_cluster_reference <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) {
if (min(Sigma) > min_cluster_corr) {
return(list(n_cluster = 1, Qmodularity = 1))
}
if (ncol(Sigma) < 10) {
m <- ncol(Sigma)
}
Q <- numeric(m)
for (i in seq_len(m)) {
index <- cutree(hc, i)
B <- sapply(seq_len(i), function(t) as.numeric(index == t))
Q[i] <- get_modularity_reference(Sigma, B)
}
IND <- which(Q == max(Q))
if (length(IND) > 1) IND <- IND[1]
list(n_cluster = IND, Qmodularity = Q)
}

for (p in c(8, 20, 50)) {
Sigma <- matrix(rnorm(p * p), p, p)
Sigma <- (Sigma + t(Sigma)) / 2
diag(Sigma) <- 1
hc <- hclust(as.dist(1 - Sigma))

current <- get_n_cluster(hc, Sigma, m = p, min_cluster_corr = 0.999)
reference <- get_n_cluster_reference(hc, Sigma, m = p, min_cluster_corr = 0.999)

expect_equal(current$n_cluster, reference$n_cluster)
expect_equal(current$Qmodularity, reference$Qmodularity, tolerance = 1e-10)

for (k in c(1, ceiling(p / 3), p)) {
index <- cutree(hc, k)
B <- sapply(seq_len(k), function(t) as.numeric(index == t))
direct <- get_modularity_reference(Sigma, B)
components <- get_modularity_components(Sigma)
block_sum <- get_modularity_partition(components, index, k)
expect_equal(block_sum, direct, tolerance = 1e-10)
expect_equal(get_modularity(Sigma, B), direct, tolerance = 1e-10)
}
}
})


# Test get_ambiguous_colocalization function
test_that("get_ambiguous_colocalization identifies ambiguous colocalizations correctly", {
Expand Down Expand Up @@ -1176,4 +1246,4 @@ test_that("get_robust_ucos with different cutoffs produces expected ordering", {

# The number of remaining ucos should be monotonically non-increasing
expect_true(all(diff(n_ucos_remaining) <= 0))
})
})
12 changes: 6 additions & 6 deletions vignettes/ColocBoost_Wrapper_Pipeline.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost

Acknowledgment: Thanks to Kate (Kathryn) Lawrence (GitHub:@kal26) for her contributions to this vignette.

# 1. Loading Data using `colocboost_analysis_pipeline` function
# 1. Loading Data using `colocboost_pipeline` function

This function harmonizes the input data and prepares it for colocalization analysis.

Expand All @@ -32,7 +32,7 @@ In this section, we introduce how to load the regional data required for the Col
This function loads mixed datasets for a specific region, including individual-level data (genotype, phenotype, covariate data), summary statistics
(sumstats, LD), or a combination of both. It runs `load_regional_univariate_data` for each individual-level dataset and `load_rss_data` for each summary statistics dataset.
The output is a list with `individual_data` and `sumstat_data` components, where `individual_data` is a list of individual-level data and `sumstat_data` is a list of summary statistics data.
This list is then passed to the `colocboost_analysis_pipeline` function for the colocalization analysis.
This list is then passed to the `colocboost_pipeline` function for the colocalization analysis.


Below are the input parameters for this function for loading individual-level data:
Expand Down Expand Up @@ -199,9 +199,9 @@ The LD metadata file is a tab-separated file with the following columns:
```


# 2. Perform ColocBoost using `colocboost_analysis_pipeline` function
# 2. Perform ColocBoost using `colocboost_pipeline` function

In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_analysis_pipeline` function.
In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_pipeline` function.
The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data):

- **`xQTL-only mode`**: Only perform colocalization analysis on the individual-level data. Summary statistics data is not used.
Expand All @@ -226,7 +226,7 @@ Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern =

Outputs:

- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.
- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.

```{r, colocboost-analysis, eval = FALSE}
#### Please check the example code below ####
Expand Down Expand Up @@ -263,7 +263,7 @@ pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list))
qc_method = "rss_qc"

# run colocboost analysis
colocboost_results <- colocboost_analysis_pipeline(
colocboost_results <- colocboost_pipeline(
region_data_combined,
maf_cutoff = maf_cutoff,
pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,
Expand Down