diff --git a/.github/recipe/recipe.yaml b/.github/recipe/recipe.yaml index cd5c3358..4bf2dd7f 100644 --- a/.github/recipe/recipe.yaml +++ b/.github/recipe/recipe.yaml @@ -30,6 +30,7 @@ requirements: - bioconductor-snpstats - r-base - r-bglr + - r-bigsnpr - r-coda - r-coloc - r-colocboost @@ -73,6 +74,7 @@ requirements: - bioconductor-snpstats - r-base - r-bglr + - r-bigsnpr - r-coda - r-coloc - r-colocboost diff --git a/DESCRIPTION b/DESCRIPTION index 0bac8360..c44b9254 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -37,6 +37,8 @@ Imports: vroom Suggests: BGLR, + bigsnpr, + bigstatsr, coda, colocboost, GBJ, diff --git a/NAMESPACE b/NAMESPACE index 0d964b26..9135a008 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,7 @@ export(dentist) export(dentist_single_window) export(dpr_weights) export(enet_weights) +export(enforce_design_full_rank) export(extract_cs_info) export(extract_flatten_sumstats_from_nested) export(extract_top_pip_info) @@ -47,8 +48,10 @@ export(lasso_weights) export(lassosum_rss) export(lassosum_rss_weights) export(lbf_to_alpha) +export(ld_clump_by_score) export(ld_loader) export(ld_mismatch_qc) +export(ld_prune_by_correlation) export(load_LD_matrix) export(load_genotype_region) export(load_multicontext_sumstats) @@ -175,14 +178,18 @@ importFrom(readr,cols) importFrom(readr,parse_number) importFrom(readr,read_delim) importFrom(rlang,"!!!") +importFrom(stats,as.dist) importFrom(stats,coef) importFrom(stats,cor) +importFrom(stats,cutree) +importFrom(stats,hclust) importFrom(stats,lm.fit) importFrom(stats,pchisq) importFrom(stats,pnorm) importFrom(stats,predict) importFrom(stats,sd) importFrom(stats,setNames) +importFrom(stats,var) importFrom(stringr,str_detect) importFrom(stringr,str_remove) importFrom(stringr,str_replace) diff --git a/R/LD.R b/R/LD.R index 517e642c..5269e781 100644 --- a/R/LD.R +++ b/R/LD.R @@ -914,3 +914,385 @@ check_ld <- function(R, method_applied = method_applied ) } + +#' Prune columns by pairwise correlation (LD-style prune) +#' +#' Performs single-linkage hierarchical clustering on a correlation-distance +#' matrix (1 - |cor(X)|) and keeps one representative column per cluster at the +#' given correlation threshold. Uses \code{Rfast::cora} when available for a +#' faster correlation computation on wide matrices. +#' +#' @param X Numeric matrix. Columns are the variables to prune (typically SNP +#' genotype dosages); rows are observations. +#' @param cor_thres Numeric in (0, 1). Absolute correlation threshold. +#' Columns whose pairwise |cor| exceeds this are grouped; one survivor is +#' kept per group. Default 0.8. +#' @param verbose Logical. If TRUE, print progress messages. Default FALSE. +#' +#' @return A list with: +#' \describe{ +#' \item{X.new}{Matrix containing the retained columns of \code{X}.} +#' \item{filter.id}{Integer vector of the column indices of \code{X} that +#' were retained (in original order).} +#' } +#' +#' @examples +#' set.seed(1) +#' X <- matrix(rnorm(100 * 5), 100, 5) +#' X[, 2] <- X[, 1] + rnorm(100, sd = 0.01) # near-duplicate of col 1 +#' res <- ld_prune_by_correlation(X, cor_thres = 0.9) +#' ncol(res$X.new) +#' +#' @importFrom stats as.dist hclust cutree cor +#' @export +ld_prune_by_correlation <- function(X, cor_thres = 0.8, verbose = FALSE) { + p <- ncol(X) + + if (requireNamespace("Rfast", quietly = TRUE)) { + cor.X <- Rfast::cora(X, large = TRUE) + } else { + cor.X <- cor(X) + } + Sigma.distance <- as.dist(1 - abs(cor.X)) + fit <- hclust(Sigma.distance, method = "single") + clusters <- cutree(fit, h = 1 - cor_thres) + groups <- unique(clusters) + ind.delete <- NULL + X.new <- X + filter.id <- seq_len(p) + for (ig in seq_along(groups)) { + temp.group <- which(clusters == groups[ig]) + if (length(temp.group) > 1) { + ind.delete <- c(ind.delete, temp.group[-1]) + } + } + ind.delete <- unique(ind.delete) + if (length(ind.delete) > 0) { + X.new <- as.matrix(X[, -ind.delete]) + filter.id <- filter.id[-ind.delete] + if (verbose) { + message("ld_prune_by_correlation: pruned ", length(ind.delete), + " of ", p, " columns at |cor| > ", cor_thres) + } + } else if (verbose) { + message("ld_prune_by_correlation: no columns pruned at |cor| > ", cor_thres) + } + + if (ncol(X.new) == 1) { + colnames(X.new) <- colnames(X)[-ind.delete] + } + + list(X.new = X.new, filter.id = filter.id) +} + +#' Drop collinear columns from a design matrix by a chosen strategy +#' +#' Given a numeric matrix \code{X} and a set of column names known to be +#' involved in linear dependencies, remove one column using one of three +#' strategies. Designed to be called iteratively by +#' \code{\link{enforce_design_full_rank}}, but can be used standalone. +#' +#' @param X Numeric matrix. Must have column names covering +#' \code{problematic_cols}. +#' @param problematic_cols Character vector of column names in \code{X} that +#' are candidates for removal. If empty, \code{X} is returned unchanged. +#' @param strategy One of \code{"correlation"} (remove the column with the +#' largest sum of absolute pairwise correlations among the candidates; +#' when only two candidates, one is picked at random), \code{"variance"} +#' (remove the lowest-variance candidate), or \code{"response_correlation"} +#' (remove the candidate whose correlation with \code{response} has the +#' smallest magnitude). +#' @param response Numeric vector required when \code{strategy = +#' "response_correlation"}; the outcome to correlate against. +#' @param verbose Logical. If TRUE, print which column was removed. Default +#' FALSE. +#' +#' @return \code{X} with exactly one column removed (or unchanged if +#' \code{problematic_cols} is empty). +#' +#' @examples +#' set.seed(1) +#' X <- matrix(rnorm(100 * 3), 100, 3) +#' X[, 3] <- X[, 1] + X[, 2] +#' colnames(X) <- c("a", "b", "c") +#' drop_collinear_columns(X, problematic_cols = c("a", "b", "c"), +#' strategy = "variance") +#' +#' @importFrom stats var cor +#' @keywords internal +#' @noRd +drop_collinear_columns <- function(X, problematic_cols, + strategy = c("correlation", "variance", "response_correlation"), + response = NULL, verbose = FALSE) { + strategy <- match.arg(strategy) + + if (length(problematic_cols) == 0) { + return(X) + } + + if (length(problematic_cols) == 1) { + col_to_remove <- problematic_cols[1] + if (verbose) message("drop_collinear_columns: removing single column ", col_to_remove) + X <- X[, !(colnames(X) %in% col_to_remove), drop = FALSE] + return(X) + } + + if (strategy == "variance") { + variances <- apply(X[, problematic_cols, drop = FALSE], 2, var) + col_to_remove <- problematic_cols[which.min(variances)] + if (verbose) message("drop_collinear_columns: smallest variance -> removing ", col_to_remove) + } else if (strategy == "correlation") { + cor_matrix <- abs(cor(X[, problematic_cols, drop = FALSE])) + diag(cor_matrix) <- 0 + + if (length(problematic_cols) == 2) { + col_to_remove <- sample(problematic_cols, 1) + if (verbose) message("drop_collinear_columns: two candidates, randomly removing ", col_to_remove) + } else { + cor_sums <- colSums(cor_matrix) + col_to_remove <- problematic_cols[which.max(cor_sums)] + if (verbose) message("drop_collinear_columns: highest sum |cor| -> removing ", col_to_remove) + } + } else if (strategy == "response_correlation") { + if (is.null(response)) { + stop("response must be supplied for strategy = 'response_correlation'") + } + cor_with_response <- apply(X[, problematic_cols, drop = FALSE], 2, + function(col) cor(col, response)) + col_to_remove <- problematic_cols[which.min(abs(cor_with_response))] + if (verbose) message("drop_collinear_columns: smallest |cor| with response -> removing ", col_to_remove) + } + + X[, !(colnames(X) %in% col_to_remove), drop = FALSE] +} + +#' Iteratively enforce full column rank on a design matrix +#' +#' Given a candidate predictor matrix \code{X} and an optional unnamed +#' covariate matrix \code{C}, builds the design \code{[1, X, C]} and removes +#' rank-deficient columns from \code{X} until the design has full column rank. +#' Rank-deficient columns are identified via the pivot of +#' \code{qr([1, X, C])}. On each iteration, one problematic column is dropped +#' using \code{\link{drop_collinear_columns}}. If iterative pruning does not +#' achieve full rank, falls back to \code{\link{ld_prune_by_correlation}} at a +#' descending sequence of correlation thresholds. +#' +#' @param X Numeric matrix with column names (the predictors subject to +#' pruning). +#' @param C Numeric matrix of covariates (can be unnamed) that will be kept. +#' Pass \code{NULL} or a zero-column matrix when there are no covariates. +#' @param strategy Passed through to \code{\link{drop_collinear_columns}}. +#' @param response Passed through to \code{\link{drop_collinear_columns}} +#' when \code{strategy = "response_correlation"}. +#' @param max_iterations Integer. Hard cap on the iterative-prune loop. +#' Default 300. +#' @param corr_thresholds Numeric vector of |cor| thresholds used for the +#' \code{\link{ld_prune_by_correlation}} fallback, tried in order. +#' Default \code{seq(0.75, 0.5, by = -0.05)}. +#' @param verbose Logical. If TRUE, print per-iteration progress. Default +#' FALSE. +#' +#' @return The pruned predictor matrix \code{X} (covariates \code{C} are not +#' modified). +#' +#' @examples +#' set.seed(1) +#' X <- matrix(rnorm(100 * 4), 100, 4) +#' X[, 4] <- X[, 1] + X[, 2] # rank-deficient +#' colnames(X) <- c("a", "b", "c", "d") +#' C <- matrix(rnorm(100), 100, 1) +#' X2 <- enforce_design_full_rank(X, C, strategy = "variance") +#' qr(cbind(1, X2, C))$rank == ncol(cbind(1, X2, C)) +#' +#' @export +enforce_design_full_rank <- function(X, C, + strategy = c("correlation", "variance", "response_correlation"), + response = NULL, + max_iterations = 300L, + corr_thresholds = seq(0.75, 0.5, by = -0.05), + verbose = FALSE) { + strategy <- match.arg(strategy) + original_colnames <- colnames(X) + initial_ncol <- ncol(X) + iteration <- 0L + + build_design <- function(X) { + XD <- cbind(1, X, C) + colnames(XD)[seq_len(ncol(X) + 1L)] <- c("Intercept", colnames(X)) + XD + } + + X_design <- build_design(X) + matrix_rank <- qr(X_design)$rank + if (verbose) { + message("enforce_design_full_rank: initial rank ", matrix_rank, + " / ", ncol(X_design)) + } + + skip_iterative <- FALSE + + # Fast path: try removing all QR-pivot-flagged columns at once. + if (matrix_rank < ncol(X_design)) { + qrd <- qr(X_design) + problematic_cols <- qrd$pivot[(qrd$rank + 1L):ncol(X_design)] + problematic_colnames <- colnames(X_design)[problematic_cols] + problematic_colnames <- problematic_colnames[problematic_colnames %in% colnames(X)] + + if (length(problematic_colnames) > 0) { + X_temp <- X[, !(colnames(X) %in% problematic_colnames), drop = FALSE] + if (qr(build_design(X_temp))$rank == ncol(build_design(X_temp))) { + if (verbose) { + message("enforce_design_full_rank: full rank after batch-removing ", + length(problematic_colnames), " column(s)") + } + } else { + skip_iterative <- TRUE + if (verbose) { + message("enforce_design_full_rank: batch removal insufficient, ", + "skipping to correlation-pruning fallback") + } + } + } + } + + # Iterative path. + if (!skip_iterative) { + while (matrix_rank < ncol(X_design) && iteration < max_iterations) { + qrd <- qr(X_design) + problematic_cols <- qrd$pivot[(qrd$rank + 1L):ncol(X_design)] + problematic_colnames <- colnames(X_design)[problematic_cols] + problematic_colnames <- problematic_colnames[problematic_colnames %in% colnames(X)] + + if (length(problematic_colnames) == 0) break + + X <- drop_collinear_columns(X, problematic_colnames, strategy = strategy, + response = response, verbose = verbose) + + X_design <- build_design(X) + matrix_rank <- qr(X_design)$rank + iteration <- iteration + 1L + if (verbose) { + message("enforce_design_full_rank: iter ", iteration, + " rank ", matrix_rank, " / ", ncol(X_design)) + } + } + + if (iteration == max_iterations) { + warning("enforce_design_full_rank: max_iterations reached; design may still be rank-deficient") + } + } + + # Correlation-threshold fallback. + X_design <- build_design(X) + matrix_rank <- qr(X_design)$rank + if (matrix_rank < ncol(X_design)) { + if (verbose) { + message("enforce_design_full_rank: applying ld_prune_by_correlation fallback") + } + for (threshold in corr_thresholds) { + filter_result <- ld_prune_by_correlation(X, cor_thres = threshold, + verbose = verbose) + X <- filter_result$X.new + X_design <- build_design(X) + matrix_rank <- qr(X_design)$rank + if (verbose) { + message("enforce_design_full_rank: threshold ", threshold, + " -> rank ", matrix_rank, " / ", ncol(X_design)) + } + if (matrix_rank == ncol(X_design)) break + } + } + + if (ncol(X) == 1L && initial_ncol == 1L) { + colnames(X) <- original_colnames + } + X +} + +#' LD clumping by a per-variant score using bigsnpr +#' +#' Wraps \code{bigsnpr::snp_clumping} with the boilerplate of wrapping a +#' numeric dosage matrix into a \code{bigstatsr::FBM.code256} object and of +#' handling the common pitfall of a single-variant input. +#' +#' @param X Numeric matrix of 0/1/2 allele dosages, n rows by p variants. +#' Column names are expected to be variant IDs but are not required. +#' @param score Numeric vector of length \code{ncol(X)}. Higher values favour +#' retention during clumping (e.g. -log10 p, |Z|, MAF). May be \code{NULL}, +#' in which case bigsnpr falls back to minor allele frequency computed from +#' \code{X}. +#' @param chr Integer or character vector of length \code{ncol(X)} giving the +#' chromosome for each variant. +#' @param pos Integer vector of length \code{ncol(X)} giving the base-pair +#' position for each variant. +#' @param r2 Numeric in (0, 1]. r-squared threshold for clumping (variants +#' within \code{window_kb} whose r2 exceeds \code{r2} and have lower +#' \code{score} are removed). Default 0.2. +#' @param window_kb Numeric. Window size in kilobases. Default is +#' \code{100 / r2}, matching the common "ld-clump size = 100/r2" heuristic +#' used in many GWAS pipelines. +#' @param verbose Logical. If TRUE, print the number of retained variants. +#' Default FALSE. +#' +#' @return An integer vector of indices (into \code{X} columns) kept after +#' clumping. For a single-column \code{X}, returns \code{1L}. +#' +#' @examples +#' \dontrun{ +#' set.seed(1) +#' n <- 500; p <- 20 +#' X <- matrix(rbinom(n * p, 2, 0.3), n, p) +#' colnames(X) <- paste0("chr1:", seq_len(p) * 1000, ":A:G") +#' s <- runif(p) +#' chr <- rep(1L, p); pos <- seq_len(p) * 1000L +#' keep <- ld_clump_by_score(X, score = s, chr = chr, pos = pos, r2 = 0.2) +#' } +#' +#' @export +ld_clump_by_score <- function(X, score, chr, pos, r2 = 0.2, + window_kb = 100 / r2, verbose = FALSE) { + if (!requireNamespace("bigsnpr", quietly = TRUE)) { + stop("Package 'bigsnpr' is required. Install from CRAN: install.packages('bigsnpr')") + } + if (!requireNamespace("bigstatsr", quietly = TRUE)) { + stop("Package 'bigstatsr' is required. Install from CRAN: install.packages('bigstatsr')") + } + + if (ncol(X) < 1L) stop("ld_clump_by_score: X must have at least one column") + if (!is.null(score) && length(score) != ncol(X)) { + stop("ld_clump_by_score: length(score) must equal ncol(X)") + } + if (length(chr) != ncol(X) || length(pos) != ncol(X)) { + stop("ld_clump_by_score: chr and pos must have length equal to ncol(X)") + } + + if (ncol(X) == 1L) { + if (verbose) message("ld_clump_by_score: single variant, skipping clumping") + return(1L) + } + + if (inherits(X, "FBM")) { + G <- X + } else { + code_vec <- c(0, 1, 2, rep(NA, 256L - 3L)) + G <- bigstatsr::FBM.code256( + nrow = nrow(X), ncol = ncol(X), + init = X, code = code_vec + ) + } + + keep <- bigsnpr::snp_clumping( + G = G, + infos.chr = as.integer(chr), + infos.pos = as.integer(pos), + S = score, + thr.r2 = r2, + size = window_kb + ) + + if (verbose) { + message("ld_clump_by_score: ", length(keep), " / ", ncol(X), + " variants retained at r2 <= ", r2) + } + keep +} diff --git a/R/twas.R b/R/twas.R index 5b847012..595f0b4a 100644 --- a/R/twas.R +++ b/R/twas.R @@ -271,6 +271,19 @@ harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NUL #' \item{twas_table}{ A dataframe of twas results summary is generated for each gene-contexts-method pair of all methods for imputable genes.} #' \item{twas_data}{ A list of list containing formatted TWAS data.} #' } +# Shared shape for twas_analysis() result rows. Internal. +build_twas_score_row <- function(twas_rs, weight_db, context, study) { + if (is.null(twas_rs)) return(data.frame()) + data.frame( + gwas_study = study, + method = sub("_[^_]+$", "", names(twas_rs)), + twas_z = find_data(twas_rs, c(2, "z")), + twas_pval = find_data(twas_rs, c(2, "pval")), + context = context, + molecular_id = weight_db + ) +} + #' @importFrom stringr str_remove #' @export twas_pipeline <- function(twas_weights_data, @@ -482,11 +495,8 @@ twas_pipeline <- function(twas_weights_data, ) if (is.null(twas_rs)) { return(list(twas_rs_df = data.frame(), mr_rs_df = data.frame())) - } - twas_rs_df <- data.frame( - gwas_study = study, method = sub("_[^_]+$", "", names(twas_rs)), twas_z = find_data(twas_rs, c(2, "z")), - twas_pval = find_data(twas_rs, c(2, "pval")), context = context, molecular_id = weight_db - ) + } + twas_rs_df <- build_twas_score_row(twas_rs, weight_db, context, study) # MR analysis if (!is.null(twas_weights_data[[weight_db]]$susie_results) && any(na.omit(twas_rs_df$twas_pval) < mr_pval_cutoff) && diff --git a/man/twas_pipeline.Rd b/man/build_twas_score_row.Rd similarity index 70% rename from man/twas_pipeline.Rd rename to man/build_twas_score_row.Rd index c0a9a62f..81bb1240 100644 --- a/man/twas_pipeline.Rd +++ b/man/build_twas_score_row.Rd @@ -1,27 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/twas.R -\name{twas_pipeline} -\alias{twas_pipeline} +\name{build_twas_score_row} +\alias{build_twas_score_row} \title{Function to perform TWAS analysis for across multiple contexts. This function peforms TWAS analysis for multiple contexts for imputable genes within an LD region and summarize the twas results.} \usage{ -twas_pipeline( - twas_weights_data, - ld_meta_file_path, - gwas_meta_file, - region_block, - ld_reference_sample_size, - rsq_cutoff = 0.01, - rsq_pval_cutoff = 0.05, - rsq_option = c("rsq", "adj_rsq"), - rsq_pval_option = c("pval", "adj_rsq_pval"), - mr_pval_cutoff = 0.05, - mr_coverage_column = "cs_coverage_0.95", - output_twas_data = FALSE, - event_filters = NULL, - column_file_path = NULL, - comment_string = "#" -) +build_twas_score_row(twas_rs, weight_db, context, study) } \arguments{ \item{twas_weights_data}{List of list of twas weights output from generate_twas_db function.} diff --git a/man/enforce_design_full_rank.Rd b/man/enforce_design_full_rank.Rd new file mode 100644 index 00000000..1a063579 --- /dev/null +++ b/man/enforce_design_full_rank.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LD.R +\name{enforce_design_full_rank} +\alias{enforce_design_full_rank} +\title{Iteratively enforce full column rank on a design matrix} +\usage{ +enforce_design_full_rank( + X, + C, + strategy = c("correlation", "variance", "response_correlation"), + response = NULL, + max_iterations = 300L, + corr_thresholds = seq(0.75, 0.5, by = -0.05), + verbose = FALSE +) +} +\arguments{ +\item{X}{Numeric matrix with column names (the predictors subject to +pruning).} + +\item{C}{Numeric matrix of covariates (can be unnamed) that will be kept. +Pass \code{NULL} or a zero-column matrix when there are no covariates.} + +\item{strategy}{Passed through to \code{\link{drop_collinear_columns}}.} + +\item{response}{Passed through to \code{\link{drop_collinear_columns}} +when \code{strategy = "response_correlation"}.} + +\item{max_iterations}{Integer. Hard cap on the iterative-prune loop. +Default 300.} + +\item{corr_thresholds}{Numeric vector of |cor| thresholds used for the +\code{\link{ld_prune_by_correlation}} fallback, tried in order. +Default \code{seq(0.75, 0.5, by = -0.05)}.} + +\item{verbose}{Logical. If TRUE, print per-iteration progress. Default +FALSE.} +} +\value{ +The pruned predictor matrix \code{X} (covariates \code{C} are not + modified). +} +\description{ +Given a candidate predictor matrix \code{X} and an optional unnamed +covariate matrix \code{C}, builds the design \code{[1, X, C]} and removes +rank-deficient columns from \code{X} until the design has full column rank. +Rank-deficient columns are identified via the pivot of +\code{qr([1, X, C])}. On each iteration, one problematic column is dropped +using \code{\link{drop_collinear_columns}}. If iterative pruning does not +achieve full rank, falls back to \code{\link{ld_prune_by_correlation}} at a +descending sequence of correlation thresholds. +} +\examples{ +set.seed(1) +X <- matrix(rnorm(100 * 4), 100, 4) +X[, 4] <- X[, 1] + X[, 2] # rank-deficient +colnames(X) <- c("a", "b", "c", "d") +C <- matrix(rnorm(100), 100, 1) +X2 <- enforce_design_full_rank(X, C, strategy = "variance") +qr(cbind(1, X2, C))$rank == ncol(cbind(1, X2, C)) + +} diff --git a/man/ld_clump_by_score.Rd b/man/ld_clump_by_score.Rd new file mode 100644 index 00000000..de79f3cd --- /dev/null +++ b/man/ld_clump_by_score.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LD.R +\name{ld_clump_by_score} +\alias{ld_clump_by_score} +\title{LD clumping by a per-variant score using bigsnpr} +\usage{ +ld_clump_by_score( + X, + score, + chr, + pos, + r2 = 0.2, + window_kb = 100/r2, + verbose = FALSE +) +} +\arguments{ +\item{X}{Numeric matrix of 0/1/2 allele dosages, n rows by p variants. +Column names are expected to be variant IDs but are not required.} + +\item{score}{Numeric vector of length \code{ncol(X)}. Higher values favour +retention during clumping (e.g. -log10 p, |Z|, MAF). May be \code{NULL}, +in which case bigsnpr falls back to minor allele frequency computed from +\code{X}.} + +\item{chr}{Integer or character vector of length \code{ncol(X)} giving the +chromosome for each variant.} + +\item{pos}{Integer vector of length \code{ncol(X)} giving the base-pair +position for each variant.} + +\item{r2}{Numeric in (0, 1]. r-squared threshold for clumping (variants +within \code{window_kb} whose r2 exceeds \code{r2} and have lower +\code{score} are removed). Default 0.2.} + +\item{window_kb}{Numeric. Window size in kilobases. Default is +\code{100 / r2}, matching the common "ld-clump size = 100/r2" heuristic +used in many GWAS pipelines.} + +\item{verbose}{Logical. If TRUE, print the number of retained variants. +Default FALSE.} +} +\value{ +An integer vector of indices (into \code{X} columns) kept after + clumping. For a single-column \code{X}, returns \code{1L}. +} +\description{ +Wraps \code{bigsnpr::snp_clumping} with the boilerplate of wrapping a +numeric dosage matrix into a \code{bigstatsr::FBM.code256} object and of +handling the common pitfall of a single-variant input. +} +\examples{ +\dontrun{ + set.seed(1) + n <- 500; p <- 20 + X <- matrix(rbinom(n * p, 2, 0.3), n, p) + colnames(X) <- paste0("chr1:", seq_len(p) * 1000, ":A:G") + s <- runif(p) + chr <- rep(1L, p); pos <- seq_len(p) * 1000L + keep <- ld_clump_by_score(X, score = s, chr = chr, pos = pos, r2 = 0.2) +} + +} diff --git a/man/ld_prune_by_correlation.Rd b/man/ld_prune_by_correlation.Rd new file mode 100644 index 00000000..700222c2 --- /dev/null +++ b/man/ld_prune_by_correlation.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LD.R +\name{ld_prune_by_correlation} +\alias{ld_prune_by_correlation} +\title{Prune columns by pairwise correlation (LD-style prune)} +\usage{ +ld_prune_by_correlation(X, cor_thres = 0.8, verbose = FALSE) +} +\arguments{ +\item{X}{Numeric matrix. Columns are the variables to prune (typically SNP +genotype dosages); rows are observations.} + +\item{cor_thres}{Numeric in (0, 1). Absolute correlation threshold. +Columns whose pairwise |cor| exceeds this are grouped; one survivor is +kept per group. Default 0.8.} + +\item{verbose}{Logical. If TRUE, print progress messages. Default FALSE.} +} +\value{ +A list with: + \describe{ + \item{X.new}{Matrix containing the retained columns of \code{X}.} + \item{filter.id}{Integer vector of the column indices of \code{X} that + were retained (in original order).} + } +} +\description{ +Performs single-linkage hierarchical clustering on a correlation-distance +matrix (1 - |cor(X)|) and keeps one representative column per cluster at the +given correlation threshold. Uses \code{Rfast::cora} when available for a +faster correlation computation on wide matrices. +} +\examples{ +set.seed(1) +X <- matrix(rnorm(100 * 5), 100, 5) +X[, 2] <- X[, 1] + rnorm(100, sd = 0.01) # near-duplicate of col 1 +res <- ld_prune_by_correlation(X, cor_thres = 0.9) +ncol(res$X.new) + +} diff --git a/pixi.toml b/pixi.toml index 384c6b3c..5e92dfd2 100644 --- a/pixi.toml +++ b/pixi.toml @@ -48,6 +48,7 @@ r45 = {features = ["r45"]} "bioconductor-snpstats" = "*" "r-base" = "*" "r-bglr" = "*" +"r-bigsnpr" = "*" "r-coda" = "*" "r-coloc" = "*" "r-colocboost" = "*" diff --git a/tests/testthat/test_ld_prune.R b/tests/testthat/test_ld_prune.R new file mode 100644 index 00000000..a1c90231 --- /dev/null +++ b/tests/testthat/test_ld_prune.R @@ -0,0 +1,272 @@ +library(testthat) + +# =========================================================================== +# ld_prune_by_correlation +# =========================================================================== + +test_that("ld_prune_by_correlation removes highly correlated columns", { + set.seed(42) + n <- 50; p <- 10 + X <- matrix(rnorm(n * p), nrow = n) + colnames(X) <- paste0("v", 1:p) + X[, 2] <- X[, 1] + rnorm(n, sd = 0.01) + result <- ld_prune_by_correlation(X, cor_thres = 0.9) + expect_true(ncol(result$X.new) < p) + expect_equal(length(result$filter.id), ncol(result$X.new)) +}) + +test_that("ld_prune_by_correlation keeps all columns when uncorrelated", { + set.seed(42) + n <- 100; p <- 5 + X <- matrix(rnorm(n * p), nrow = n) + colnames(X) <- paste0("v", 1:p) + result <- ld_prune_by_correlation(X, cor_thres = 0.99) + expect_equal(ncol(result$X.new), p) + expect_equal(result$filter.id, 1:p) +}) + +test_that("ld_prune_by_correlation preserves colnames for single remaining column", { + set.seed(42) + n <- 50 + X <- matrix(rnorm(n * 3), nrow = n) + colnames(X) <- c("a", "b", "c") + X[, 2] <- X[, 1] + rnorm(n, sd = 0.001) + X[, 3] <- X[, 1] + rnorm(n, sd = 0.001) + result <- ld_prune_by_correlation(X, cor_thres = 0.5) + expect_true(ncol(result$X.new) >= 1) + expect_true(!is.null(colnames(result$X.new))) +}) + +test_that("ld_prune_by_correlation errors on single-column input", { + set.seed(42) + n <- 30 + X <- matrix(rnorm(n), nrow = n, ncol = 1) + colnames(X) <- "v1" + expect_error(ld_prune_by_correlation(X, cor_thres = 0.8)) +}) + +test_that("ld_prune_by_correlation strict threshold removes at least as many as lenient", { + set.seed(42) + n <- 100; p <- 5 + X <- matrix(rnorm(n * p), nrow = n) + colnames(X) <- paste0("v", 1:p) + X[, 2] <- X[, 1] + rnorm(n, sd = 0.1) + X[, 3] <- X[, 1] + rnorm(n, sd = 0.1) + X[, 5] <- X[, 4] + rnorm(n, sd = 0.1) + result_strict <- ld_prune_by_correlation(X, cor_thres = 0.3) + result_lenient <- ld_prune_by_correlation(X, cor_thres = 0.99) + expect_true(ncol(result_strict$X.new) <= ncol(result_lenient$X.new)) +}) + +test_that("ld_prune_by_correlation preserves colnames when no columns deleted", { + set.seed(42) + n <- 100; p <- 3 + X <- matrix(rnorm(n * p), nrow = n) + colnames(X) <- c("snp_a", "snp_b", "snp_c") + result <- ld_prune_by_correlation(X, cor_thres = 0.999) + expect_equal(colnames(result$X.new), colnames(X)) +}) + +test_that("ld_prune_by_correlation is silent by default, chatty with verbose", { + set.seed(1) + X <- matrix(rnorm(100), 20, 5) + colnames(X) <- paste0("v", 1:5) + X[, 2] <- X[, 1] + rnorm(20, sd = 1e-3) + expect_silent(ld_prune_by_correlation(X, cor_thres = 0.9)) + expect_message(ld_prune_by_correlation(X, cor_thres = 0.9, verbose = TRUE), + "ld_prune_by_correlation") +}) + +# =========================================================================== +# drop_collinear_columns +# =========================================================================== + +# drop_collinear_columns and enforce_design_full_rank are unexported helpers; +# access them via pecotmr::: in these tests. + +test_that("drop_collinear_columns returns X unchanged when problematic_cols is empty", { + X <- matrix(rnorm(100), nrow = 20, ncol = 5) + colnames(X) <- paste0("v", 1:5) + result <- pecotmr:::drop_collinear_columns(X, problematic_cols = character(0), strategy = "correlation") + expect_equal(ncol(result), 5) +}) + +test_that("drop_collinear_columns removes single problematic column", { + X <- matrix(rnorm(100), nrow = 20, ncol = 5) + colnames(X) <- paste0("v", 1:5) + result <- pecotmr:::drop_collinear_columns(X, problematic_cols = "v3", strategy = "correlation") + expect_equal(ncol(result), 4) + expect_false("v3" %in% colnames(result)) +}) + +test_that("drop_collinear_columns variance strategy removes lowest variance column", { + set.seed(42) + n <- 50 + X <- matrix(rnorm(n * 3), nrow = n, ncol = 3) + colnames(X) <- c("low_var", "mid_var", "high_var") + X[, 1] <- X[, 1] * 0.01 + X[, 3] <- X[, 3] * 10 + result <- pecotmr:::drop_collinear_columns(X, problematic_cols = c("low_var", "mid_var", "high_var"), + strategy = "variance") + expect_equal(ncol(result), 2) + expect_false("low_var" %in% colnames(result)) +}) + +test_that("drop_collinear_columns correlation strategy with two columns removes one", { + set.seed(42) + X <- matrix(rnorm(100), nrow = 20, ncol = 5) + colnames(X) <- paste0("v", 1:5) + result <- pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2"), strategy = "correlation") + expect_equal(ncol(result), 4) +}) + +test_that("drop_collinear_columns correlation strategy with 3+ cols removes highest sum", { + set.seed(42) + n <- 50 + X <- matrix(rnorm(n * 4), nrow = n, ncol = 4) + colnames(X) <- paste0("v", 1:4) + X[, 2] <- X[, 1] + rnorm(n, sd = 0.01) + X[, 3] <- X[, 1] + rnorm(n, sd = 0.01) + result <- pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2", "v3"), + strategy = "correlation") + expect_equal(ncol(result), 3) +}) + +test_that("drop_collinear_columns response_correlation strategy removes lowest |cor| with response", { + set.seed(42) + n <- 50 + X <- matrix(rnorm(n * 3), nrow = n, ncol = 3) + colnames(X) <- c("v1", "v2", "v3") + response <- X[, 1] * 2 + rnorm(n, sd = 0.1) + result <- pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2", "v3"), + strategy = "response_correlation", + response = response) + expect_equal(ncol(result), 2) + expect_true("v1" %in% colnames(result)) +}) + +test_that("drop_collinear_columns errors on response_correlation without response", { + X <- matrix(rnorm(60), 20, 3) + colnames(X) <- c("v1", "v2", "v3") + expect_error( + pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2"), + strategy = "response_correlation"), + "response" + ) +}) + +test_that("drop_collinear_columns errors on invalid strategy", { + X <- matrix(rnorm(100), nrow = 20, ncol = 5) + colnames(X) <- paste0("v", 1:5) + expect_error( + pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2"), + strategy = "invalid_strategy"), + "arg" + ) +}) + +test_that("drop_collinear_columns preserves column name when single column remains", { + set.seed(42) + X <- matrix(rnorm(40), nrow = 20, ncol = 2) + colnames(X) <- c("keeper", "removed") + result <- pecotmr:::drop_collinear_columns(X, problematic_cols = "removed", strategy = "correlation") + expect_equal(ncol(result), 1) + expect_equal(colnames(result), "keeper") +}) + +test_that("drop_collinear_columns is silent by default", { + X <- matrix(rnorm(40), 20, 2) + colnames(X) <- c("a", "b") + expect_silent(pecotmr:::drop_collinear_columns(X, problematic_cols = "b", strategy = "correlation")) +}) + +# =========================================================================== +# enforce_design_full_rank (unexported) +# =========================================================================== + +test_that("enforce_design_full_rank returns full-rank matrix when already full rank", { + set.seed(42) + n <- 50 + X <- matrix(rnorm(n * 3), nrow = n, ncol = 3) + colnames(X) <- paste0("v", 1:3) + C <- matrix(rnorm(n * 2), nrow = n, ncol = 2) + result <- enforce_design_full_rank(X = X, C = C, strategy = "correlation") + expect_true(is.matrix(result)) + expect_true(ncol(result) >= 1) +}) + +test_that("enforce_design_full_rank handles rank-deficient design via correlation fallback", { + set.seed(42) + n <- 50 + X <- matrix(rnorm(n * 4), nrow = n, ncol = 4) + colnames(X) <- paste0("v", 1:4) + X[, 4] <- X[, 1] + X[, 2] + result <- enforce_design_full_rank(X = X, C = NULL, strategy = "correlation") + design <- cbind(1, result) + expect_equal(qr(design)$rank, ncol(design)) +}) + +test_that("enforce_design_full_rank preserves colname for single-column input", { + set.seed(42) + n <- 50 + X <- matrix(rnorm(n), nrow = n, ncol = 1) + colnames(X) <- "only_snp" + result <- enforce_design_full_rank(X = X, C = NULL, strategy = "correlation") + expect_equal(colnames(result), "only_snp") +}) + +test_that("enforce_design_full_rank is silent by default", { + set.seed(42) + n <- 50 + X <- matrix(rnorm(n * 3), n, 3) + colnames(X) <- paste0("v", 1:3) + expect_silent(enforce_design_full_rank(X = X, C = NULL, strategy = "correlation")) +}) + +# =========================================================================== +# ld_clump_by_score +# =========================================================================== + +test_that("ld_clump_by_score skips clumping on single-column input", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + set.seed(1) + X <- matrix(rbinom(100, 2, 0.3), 100, 1) + colnames(X) <- "chr1:100:A:G" + keep <- ld_clump_by_score(X, score = 1.0, chr = 1L, pos = 100L, r2 = 0.2) + expect_equal(keep, 1L) +}) + +test_that("ld_clump_by_score validates input lengths", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + set.seed(1) + X <- matrix(rbinom(100 * 3, 2, 0.3), 100, 3) + colnames(X) <- paste0("chr1:", seq_len(3) * 1000, ":A:G") + expect_error( + ld_clump_by_score(X, score = 1:2, chr = rep(1L, 3), pos = seq_len(3) * 1000L), + "score" + ) + expect_error( + ld_clump_by_score(X, score = 1:3, chr = rep(1L, 2), pos = seq_len(3) * 1000L), + "chr and pos" + ) +}) + +test_that("ld_clump_by_score returns indices on real data", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + set.seed(1) + n <- 500; p <- 20 + X <- matrix(rbinom(n * p, 2, 0.3), n, p) + colnames(X) <- paste0("chr1:", seq_len(p) * 1000, ":A:G") + # Introduce perfect LD between variants 1 and 2 + X[, 2] <- X[, 1] + score <- c(2, 1, runif(p - 2)) + chr <- rep(1L, p) + pos <- seq_len(p) * 1000L + keep <- ld_clump_by_score(X, score = score, chr = chr, pos = pos, r2 = 0.2) + expect_true(1L %in% keep) + expect_false(2L %in% keep) # pruned: same as variant 1 but lower score + expect_true(length(keep) < p) +}) diff --git a/tests/testthat/test_twas_pipeline_helpers.R b/tests/testthat/test_twas_pipeline_helpers.R new file mode 100644 index 00000000..5f01bfa5 --- /dev/null +++ b/tests/testthat/test_twas_pipeline_helpers.R @@ -0,0 +1,58 @@ +library(testthat) + +# build_twas_score_row is an unexported contract helper shared between +# twas_pipeline (pecotmr) and quantile_twas_pipeline (qQTLR). Tests cover the +# shapes returned by twas_analysis() so both pipelines stay aligned. + +test_that("build_twas_score_row returns empty data.frame for NULL input", { + out <- pecotmr:::build_twas_score_row(NULL, + weight_db = "ENSG001", + context = "Cortex", + study = "AD") + expect_s3_class(out, "data.frame") + expect_equal(nrow(out), 0) +}) + +test_that("build_twas_score_row packs a single-method twas_rs", { + # Shape mimics twas_analysis() output: apply(weights, 2, twas_z) returns + # a named list where each element is list(z=, pval=). The name's final + # "_" is the method marker (see the sub() regex). + # find_data(twas_rs, c(2, "z")) descends one level and pulls out "z". + twas_rs <- list( + enet_weights = list(z = 2.5, pval = 0.012) + ) + out <- pecotmr:::build_twas_score_row(twas_rs, + weight_db = "ENSG001", + context = "Cortex", + study = "AD") + expect_s3_class(out, "data.frame") + expect_equal(nrow(out), 1) + expect_setequal(colnames(out), + c("gwas_study", "method", "twas_z", "twas_pval", + "context", "molecular_id")) + expect_equal(out$gwas_study, "AD") + expect_equal(out$method, "enet") + expect_equal(out$twas_z, 2.5) + expect_equal(out$twas_pval, 0.012) + expect_equal(out$context, "Cortex") + expect_equal(out$molecular_id, "ENSG001") +}) + +test_that("build_twas_score_row packs a multi-method twas_rs", { + twas_rs <- list( + enet_weights = list(z = 2.5, pval = 0.012), + lasso_weights = list(z = -1.8, pval = 0.072), + top_weights = list(z = 0.4, pval = 0.689) + ) + out <- pecotmr:::build_twas_score_row(twas_rs, + weight_db = "ENSG002", + context = "Liver", + study = "T2D") + expect_equal(nrow(out), 3) + expect_equal(out$method, c("enet", "lasso", "top")) + expect_equal(out$twas_z, c(2.5, -1.8, 0.4)) + expect_equal(out$twas_pval, c(0.012, 0.072, 0.689)) + expect_true(all(out$gwas_study == "T2D")) + expect_true(all(out$context == "Liver")) + expect_true(all(out$molecular_id == "ENSG002")) +})