diff --git a/.github/recipe/recipe.yaml b/.github/recipe/recipe.yaml
index cd5c3358..4bf2dd7f 100644
--- a/.github/recipe/recipe.yaml
+++ b/.github/recipe/recipe.yaml
@@ -30,6 +30,7 @@ requirements:
     - bioconductor-snpstats
     - r-base
     - r-bglr
+    - r-bigsnpr
     - r-coda
     - r-coloc
     - r-colocboost
@@ -73,6 +74,7 @@ requirements:
     - bioconductor-snpstats
     - r-base
     - r-bglr
+    - r-bigsnpr
     - r-coda
     - r-coloc
     - r-colocboost
diff --git a/DESCRIPTION b/DESCRIPTION
index 0bac8360..c44b9254 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -37,6 +37,8 @@ Imports:
     vroom
 Suggests:
     BGLR,
+    bigsnpr,
+    bigstatsr,
     coda,
     colocboost,
     GBJ,
diff --git a/NAMESPACE b/NAMESPACE
index 0d964b26..9135a008 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -24,6 +24,7 @@ export(dentist)
 export(dentist_single_window)
 export(dpr_weights)
 export(enet_weights)
+export(enforce_design_full_rank)
 export(extract_cs_info)
 export(extract_flatten_sumstats_from_nested)
 export(extract_top_pip_info)
@@ -47,8 +48,10 @@ export(lasso_weights)
 export(lassosum_rss)
 export(lassosum_rss_weights)
 export(lbf_to_alpha)
+export(ld_clump_by_score)
 export(ld_loader)
 export(ld_mismatch_qc)
+export(ld_prune_by_correlation)
 export(load_LD_matrix)
 export(load_genotype_region)
 export(load_multicontext_sumstats)
@@ -175,14 +178,18 @@ importFrom(readr,cols)
 importFrom(readr,parse_number)
 importFrom(readr,read_delim)
 importFrom(rlang,"!!!")
+importFrom(stats,as.dist)
 importFrom(stats,coef)
 importFrom(stats,cor)
+importFrom(stats,cutree)
+importFrom(stats,hclust)
 importFrom(stats,lm.fit)
 importFrom(stats,pchisq)
 importFrom(stats,pnorm)
 importFrom(stats,predict)
 importFrom(stats,sd)
 importFrom(stats,setNames)
+importFrom(stats,var)
 importFrom(stringr,str_detect)
 importFrom(stringr,str_remove)
 importFrom(stringr,str_replace)
diff --git a/R/LD.R b/R/LD.R
index 517e642c..5269e781 100644
--- a/R/LD.R
+++ b/R/LD.R
@@ -914,3 +914,385 @@ check_ld <- function(R,
     method_applied = method_applied
   )
 }
+
+#' Prune columns by pairwise correlation (LD-style prune)
+#'
+#' Performs single-linkage hierarchical clustering on a correlation-distance
+#' matrix (1 - |cor(X)|) and keeps one representative column per cluster at the
+#' given correlation threshold. Uses \code{Rfast::cora} when available for a
+#' faster correlation computation on wide matrices.
+#'
+#' @param X Numeric matrix. Columns are the variables to prune (typically SNP
+#'   genotype dosages); rows are observations.
+#' @param cor_thres Numeric in (0, 1). Absolute correlation threshold.
+#'   Columns whose pairwise |cor| exceeds this are grouped; one survivor is
+#'   kept per group. Default 0.8.
+#' @param verbose Logical. If TRUE, print progress messages. Default FALSE.
+#'
+#' @return A list with:
+#'   \describe{
+#'     \item{X.new}{Matrix containing the retained columns of \code{X}.}
+#'     \item{filter.id}{Integer vector of the column indices of \code{X} that
+#'       were retained (in original order).}
+#'   }
+#'
+#' @examples
+#' set.seed(1)
+#' X <- matrix(rnorm(100 * 5), 100, 5)
+#' X[, 2] <- X[, 1] + rnorm(100, sd = 0.01)   # near-duplicate of col 1
+#' res <- ld_prune_by_correlation(X, cor_thres = 0.9)
+#' ncol(res$X.new)
+#'
+#' @importFrom stats as.dist hclust cutree cor
+#' @export
+ld_prune_by_correlation <- function(X, cor_thres = 0.8, verbose = FALSE) {
+  p <- ncol(X)
+
+  if (requireNamespace("Rfast", quietly = TRUE)) {
+    cor.X <- Rfast::cora(X, large = TRUE)
+  } else {
+    cor.X <- cor(X)
+  }
+  Sigma.distance <- as.dist(1 - abs(cor.X))
+  fit <- hclust(Sigma.distance, method = "single")
+  clusters <- cutree(fit, h = 1 - cor_thres)
+  groups <- unique(clusters)
+  ind.delete <- NULL
+  X.new <- X
+  filter.id <- seq_len(p)
+  for (ig in seq_along(groups)) {
+    temp.group <- which(clusters == groups[ig])
+    if (length(temp.group) > 1) {
+      ind.delete <- c(ind.delete, temp.group[-1])
+    }
+  }
+  ind.delete <- unique(ind.delete)
+  if (length(ind.delete) > 0) {
+    X.new <- as.matrix(X[, -ind.delete])
+    filter.id <- filter.id[-ind.delete]
+    if (verbose) {
+      message("ld_prune_by_correlation: pruned ", length(ind.delete),
+              " of ", p, " columns at |cor| > ", cor_thres)
+    }
+  } else if (verbose) {
+    message("ld_prune_by_correlation: no columns pruned at |cor| > ", cor_thres)
+  }
+
+  if (ncol(X.new) == 1) {
+    colnames(X.new) <- colnames(X)[-ind.delete]
+  }
+
+  list(X.new = X.new, filter.id = filter.id)
+}
+
+#' Drop collinear columns from a design matrix by a chosen strategy
+#'
+#' Given a numeric matrix \code{X} and a set of column names known to be
+#' involved in linear dependencies, remove one column using one of three
+#' strategies. Designed to be called iteratively by
+#' \code{\link{enforce_design_full_rank}}, but can be used standalone.
+#'
+#' @param X Numeric matrix. Must have column names covering
+#'   \code{problematic_cols}.
+#' @param problematic_cols Character vector of column names in \code{X} that
+#'   are candidates for removal. If empty, \code{X} is returned unchanged.
+#' @param strategy One of \code{"correlation"} (remove the column with the
+#'   largest sum of absolute pairwise correlations among the candidates;
+#'   when only two candidates, one is picked at random), \code{"variance"}
+#'   (remove the lowest-variance candidate), or \code{"response_correlation"}
+#'   (remove the candidate whose correlation with \code{response} has the
+#'   smallest magnitude).
+#' @param response Numeric vector required when \code{strategy =
+#'   "response_correlation"}; the outcome to correlate against.
+#' @param verbose Logical. If TRUE, print which column was removed. Default
+#'   FALSE.
+#'
+#' @return \code{X} with exactly one column removed (or unchanged if
+#'   \code{problematic_cols} is empty).
+#'
+#' @examples
+#' set.seed(1)
+#' X <- matrix(rnorm(100 * 3), 100, 3)
+#' X[, 3] <- X[, 1] + X[, 2]
+#' colnames(X) <- c("a", "b", "c")
+#' drop_collinear_columns(X, problematic_cols = c("a", "b", "c"),
+#'                        strategy = "variance")
+#'
+#' @importFrom stats var cor
+#' @keywords internal
+#' @noRd
+drop_collinear_columns <- function(X, problematic_cols,
+                                   strategy = c("correlation", "variance", "response_correlation"),
+                                   response = NULL, verbose = FALSE) {
+  strategy <- match.arg(strategy)
+
+  if (length(problematic_cols) == 0) {
+    return(X)
+  }
+
+  if (length(problematic_cols) == 1) {
+    col_to_remove <- problematic_cols[1]
+    if (verbose) message("drop_collinear_columns: removing single column ", col_to_remove)
+    X <- X[, !(colnames(X) %in% col_to_remove), drop = FALSE]
+    return(X)
+  }
+
+  if (strategy == "variance") {
+    variances <- apply(X[, problematic_cols, drop = FALSE], 2, var)
+    col_to_remove <- problematic_cols[which.min(variances)]
+    if (verbose) message("drop_collinear_columns: smallest variance -> removing ", col_to_remove)
+  } else if (strategy == "correlation") {
+    cor_matrix <- abs(cor(X[, problematic_cols, drop = FALSE]))
+    diag(cor_matrix) <- 0
+
+    if (length(problematic_cols) == 2) {
+      col_to_remove <- sample(problematic_cols, 1)
+      if (verbose) message("drop_collinear_columns: two candidates, randomly removing ", col_to_remove)
+    } else {
+      cor_sums <- colSums(cor_matrix)
+      col_to_remove <- problematic_cols[which.max(cor_sums)]
+      if (verbose) message("drop_collinear_columns: highest sum |cor| -> removing ", col_to_remove)
+    }
+  } else if (strategy == "response_correlation") {
+    if (is.null(response)) {
+      stop("response must be supplied for strategy = 'response_correlation'")
+    }
+    cor_with_response <- apply(X[, problematic_cols, drop = FALSE], 2,
+                               function(col) cor(col, response))
+    col_to_remove <- problematic_cols[which.min(abs(cor_with_response))]
+    if (verbose) message("drop_collinear_columns: smallest |cor| with response -> removing ", col_to_remove)
+  }
+
+  X[, !(colnames(X) %in% col_to_remove), drop = FALSE]
+}
+
+#' Iteratively enforce full column rank on a design matrix
+#'
+#' Given a candidate predictor matrix \code{X} and an optional unnamed
+#' covariate matrix \code{C}, builds the design \code{[1, X, C]} and removes
+#' rank-deficient columns from \code{X} until the design has full column rank.
+#' Rank-deficient columns are identified via the pivot of
+#' \code{qr([1, X, C])}. On each iteration, one problematic column is dropped
+#' using \code{\link{drop_collinear_columns}}. If iterative pruning does not
+#' achieve full rank, falls back to \code{\link{ld_prune_by_correlation}} at a
+#' descending sequence of correlation thresholds.
+#'
+#' @param X Numeric matrix with column names (the predictors subject to
+#'   pruning).
+#' @param C Numeric matrix of covariates (can be unnamed) that will be kept.
+#'   Pass \code{NULL} or a zero-column matrix when there are no covariates.
+#' @param strategy Passed through to \code{\link{drop_collinear_columns}}.
+#' @param response Passed through to \code{\link{drop_collinear_columns}}
+#'   when \code{strategy = "response_correlation"}.
+#' @param max_iterations Integer. Hard cap on the iterative-prune loop.
+#'   Default 300.
+#' @param corr_thresholds Numeric vector of |cor| thresholds used for the
+#'   \code{\link{ld_prune_by_correlation}} fallback, tried in order.
+#'   Default \code{seq(0.75, 0.5, by = -0.05)}.
+#' @param verbose Logical. If TRUE, print per-iteration progress. Default
+#'   FALSE.
+#'
+#' @return The pruned predictor matrix \code{X} (covariates \code{C} are not
+#'   modified).
+#'
+#' @examples
+#' set.seed(1)
+#' X <- matrix(rnorm(100 * 4), 100, 4)
+#' X[, 4] <- X[, 1] + X[, 2]          # rank-deficient
+#' colnames(X) <- c("a", "b", "c", "d")
+#' C <- matrix(rnorm(100), 100, 1)
+#' X2 <- enforce_design_full_rank(X, C, strategy = "variance")
+#' qr(cbind(1, X2, C))$rank == ncol(cbind(1, X2, C))
+#'
+#' @export
+enforce_design_full_rank <- function(X, C,
+                                     strategy = c("correlation", "variance", "response_correlation"),
+                                     response = NULL,
+                                     max_iterations = 300L,
+                                     corr_thresholds = seq(0.75, 0.5, by = -0.05),
+                                     verbose = FALSE) {
+  strategy <- match.arg(strategy)
+  original_colnames <- colnames(X)
+  initial_ncol <- ncol(X)
+  iteration <- 0L
+
+  build_design <- function(X) {
+    XD <- cbind(1, X, C)
+    colnames(XD)[seq_len(ncol(X) + 1L)] <- c("Intercept", colnames(X))
+    XD
+  }
+
+  X_design <- build_design(X)
+  matrix_rank <- qr(X_design)$rank
+  if (verbose) {
+    message("enforce_design_full_rank: initial rank ", matrix_rank,
+            " / ", ncol(X_design))
+  }
+
+  skip_iterative <- FALSE
+
+  # Fast path: try removing all QR-pivot-flagged columns at once.
+  if (matrix_rank < ncol(X_design)) {
+    qrd <- qr(X_design)
+    problematic_cols <- qrd$pivot[(qrd$rank + 1L):ncol(X_design)]
+    problematic_colnames <- colnames(X_design)[problematic_cols]
+    problematic_colnames <- problematic_colnames[problematic_colnames %in% colnames(X)]
+
+    if (length(problematic_colnames) > 0) {
+      X_temp <- X[, !(colnames(X) %in% problematic_colnames), drop = FALSE]
+      if (qr(build_design(X_temp))$rank == ncol(build_design(X_temp))) {
+        if (verbose) {
+          message("enforce_design_full_rank: full rank after batch-removing ",
+                  length(problematic_colnames), " column(s)")
+        }
+      } else {
+        skip_iterative <- TRUE
+        if (verbose) {
+          message("enforce_design_full_rank: batch removal insufficient, ",
+                  "skipping to correlation-pruning fallback")
+        }
+      }
+    }
+  }
+
+  # Iterative path.
+  if (!skip_iterative) {
+    while (matrix_rank < ncol(X_design) && iteration < max_iterations) {
+      qrd <- qr(X_design)
+      problematic_cols <- qrd$pivot[(qrd$rank + 1L):ncol(X_design)]
+      problematic_colnames <- colnames(X_design)[problematic_cols]
+      problematic_colnames <- problematic_colnames[problematic_colnames %in% colnames(X)]
+
+      if (length(problematic_colnames) == 0) break
+
+      X <- drop_collinear_columns(X, problematic_colnames, strategy = strategy,
+                                  response = response, verbose = verbose)
+
+      X_design <- build_design(X)
+      matrix_rank <- qr(X_design)$rank
+      iteration <- iteration + 1L
+      if (verbose) {
+        message("enforce_design_full_rank: iter ", iteration,
+                " rank ", matrix_rank, " / ", ncol(X_design))
+      }
+    }
+
+    if (iteration == max_iterations) {
+      warning("enforce_design_full_rank: max_iterations reached; design may still be rank-deficient")
+    }
+  }
+
+  # Correlation-threshold fallback.
+  X_design <- build_design(X)
+  matrix_rank <- qr(X_design)$rank
+  if (matrix_rank < ncol(X_design)) {
+    if (verbose) {
+      message("enforce_design_full_rank: applying ld_prune_by_correlation fallback")
+    }
+    for (threshold in corr_thresholds) {
+      filter_result <- ld_prune_by_correlation(X, cor_thres = threshold,
+                                               verbose = verbose)
+      X <- filter_result$X.new
+      X_design <- build_design(X)
+      matrix_rank <- qr(X_design)$rank
+      if (verbose) {
+        message("enforce_design_full_rank: threshold ", threshold,
+                " -> rank ", matrix_rank, " / ", ncol(X_design))
+      }
+      if (matrix_rank == ncol(X_design)) break
+    }
+  }
+
+  if (ncol(X) == 1L && initial_ncol == 1L) {
+    colnames(X) <- original_colnames
+  }
+  X
+}
+
+#' LD clumping by a per-variant score using bigsnpr
+#'
+#' Wraps \code{bigsnpr::snp_clumping} with the boilerplate of wrapping a
+#' numeric dosage matrix into a \code{bigstatsr::FBM.code256} object and of
+#' handling the common pitfall of a single-variant input.
+#'
+#' @param X Numeric matrix of 0/1/2 allele dosages, n rows by p variants.
+#'   Column names are expected to be variant IDs but are not required.
+#' @param score Numeric vector of length \code{ncol(X)}. Higher values favour
+#'   retention during clumping (e.g. -log10 p, |Z|, MAF). May be \code{NULL},
+#'   in which case bigsnpr falls back to minor allele frequency computed from
+#'   \code{X}.
+#' @param chr Integer or character vector of length \code{ncol(X)} giving the
+#'   chromosome for each variant.
+#' @param pos Integer vector of length \code{ncol(X)} giving the base-pair
+#'   position for each variant.
+#' @param r2 Numeric in (0, 1]. r-squared threshold for clumping (variants
+#'   within \code{window_kb} whose r2 exceeds \code{r2} and have lower
+#'   \code{score} are removed). Default 0.2.
+#' @param window_kb Numeric. Window size in kilobases. Default is
+#'   \code{100 / r2}, matching the common "ld-clump size = 100/r2" heuristic
+#'   used in many GWAS pipelines.
+#' @param verbose Logical. If TRUE, print the number of retained variants.
+#'   Default FALSE.
+#'
+#' @return An integer vector of indices (into \code{X} columns) kept after
+#'   clumping. For a single-column \code{X}, returns \code{1L}.
+#'
+#' @examples
+#' \dontrun{
+#'   set.seed(1)
+#'   n <- 500; p <- 20
+#'   X <- matrix(rbinom(n * p, 2, 0.3), n, p)
+#'   colnames(X) <- paste0("chr1:", seq_len(p) * 1000, ":A:G")
+#'   s <- runif(p)
+#'   chr <- rep(1L, p); pos <- seq_len(p) * 1000L
+#'   keep <- ld_clump_by_score(X, score = s, chr = chr, pos = pos, r2 = 0.2)
+#' }
+#'
+#' @export
+ld_clump_by_score <- function(X, score, chr, pos, r2 = 0.2,
+                              window_kb = 100 / r2, verbose = FALSE) {
+  if (!requireNamespace("bigsnpr", quietly = TRUE)) {
+    stop("Package 'bigsnpr' is required. Install from CRAN: install.packages('bigsnpr')")
+  }
+  if (!requireNamespace("bigstatsr", quietly = TRUE)) {
+    stop("Package 'bigstatsr' is required. Install from CRAN: install.packages('bigstatsr')")
+  }
+
+  if (ncol(X) < 1L) stop("ld_clump_by_score: X must have at least one column")
+  if (!is.null(score) && length(score) != ncol(X)) {
+    stop("ld_clump_by_score: length(score) must equal ncol(X)")
+  }
+  if (length(chr) != ncol(X) || length(pos) != ncol(X)) {
+    stop("ld_clump_by_score: chr and pos must have length equal to ncol(X)")
+  }
+
+  if (ncol(X) == 1L) {
+    if (verbose) message("ld_clump_by_score: single variant, skipping clumping")
+    return(1L)
+  }
+
+  if (inherits(X, "FBM")) {
+    G <- X
+  } else {
+    code_vec <- c(0, 1, 2, rep(NA, 256L - 3L))
+    G <- bigstatsr::FBM.code256(
+      nrow = nrow(X), ncol = ncol(X),
+      init = X, code = code_vec
+    )
+  }
+
+  keep <- bigsnpr::snp_clumping(
+    G = G,
+    infos.chr = as.integer(chr),
+    infos.pos = as.integer(pos),
+    S = score,
+    thr.r2 = r2,
+    size = window_kb
+  )
+
+  if (verbose) {
+    message("ld_clump_by_score: ", length(keep), " / ", ncol(X),
+            " variants retained at r2 <= ", r2)
+  }
+  keep
+}
diff --git a/R/twas.R b/R/twas.R
index 5b847012..595f0b4a 100644
--- a/R/twas.R
+++ b/R/twas.R
@@ -271,6 +271,19 @@ harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NUL
 #'   \item{twas_table}{ A dataframe of twas results summary is generated for each gene-contexts-method pair of all methods for imputable genes.}
 #'   \item{twas_data}{ A list of list containing formatted TWAS data.}
 #' }
+# Shared shape for twas_analysis() result rows. Internal.
+build_twas_score_row <- function(twas_rs, weight_db, context, study) {
+  if (is.null(twas_rs)) return(data.frame())
+  data.frame(
+    gwas_study   = study,
+    method       = sub("_[^_]+$", "", names(twas_rs)),
+    twas_z       = find_data(twas_rs, c(2, "z")),
+    twas_pval    = find_data(twas_rs, c(2, "pval")),
+    context      = context,
+    molecular_id = weight_db
+  )
+}
+
 #' @importFrom stringr str_remove
 #' @export
 twas_pipeline <- function(twas_weights_data,
@@ -482,11 +495,8 @@ twas_pipeline <- function(twas_weights_data,
         )
         if (is.null(twas_rs)) {
           return(list(twas_rs_df = data.frame(), mr_rs_df = data.frame()))
-        }             
-        twas_rs_df <- data.frame(
-          gwas_study = study, method = sub("_[^_]+$", "", names(twas_rs)), twas_z = find_data(twas_rs, c(2, "z")),
-          twas_pval = find_data(twas_rs, c(2, "pval")), context = context, molecular_id = weight_db
-        )
+        }
+        twas_rs_df <- build_twas_score_row(twas_rs, weight_db, context, study)
         # MR analysis
         if (!is.null(twas_weights_data[[weight_db]]$susie_results) &&
           any(na.omit(twas_rs_df$twas_pval) < mr_pval_cutoff) &&
diff --git a/man/twas_pipeline.Rd b/man/build_twas_score_row.Rd
similarity index 70%
rename from man/twas_pipeline.Rd
rename to man/build_twas_score_row.Rd
index c0a9a62f..81bb1240 100644
--- a/man/twas_pipeline.Rd
+++ b/man/build_twas_score_row.Rd
@@ -1,27 +1,11 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/twas.R
-\name{twas_pipeline}
-\alias{twas_pipeline}
+\name{build_twas_score_row}
+\alias{build_twas_score_row}
 \title{Function to perform TWAS analysis for across multiple contexts.
 This function peforms TWAS analysis for multiple contexts for imputable genes within an LD region and summarize the twas results.}
 \usage{
-twas_pipeline(
-  twas_weights_data,
-  ld_meta_file_path,
-  gwas_meta_file,
-  region_block,
-  ld_reference_sample_size,
-  rsq_cutoff = 0.01,
-  rsq_pval_cutoff = 0.05,
-  rsq_option = c("rsq", "adj_rsq"),
-  rsq_pval_option = c("pval", "adj_rsq_pval"),
-  mr_pval_cutoff = 0.05,
-  mr_coverage_column = "cs_coverage_0.95",
-  output_twas_data = FALSE,
-  event_filters = NULL,
-  column_file_path = NULL,
-  comment_string = "#"
-)
+build_twas_score_row(twas_rs, weight_db, context, study)
 }
 \arguments{
 \item{twas_weights_data}{List of list of twas weights output from generate_twas_db function.}
diff --git a/man/enforce_design_full_rank.Rd b/man/enforce_design_full_rank.Rd
new file mode 100644
index 00000000..1a063579
--- /dev/null
+++ b/man/enforce_design_full_rank.Rd
@@ -0,0 +1,62 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/LD.R
+\name{enforce_design_full_rank}
+\alias{enforce_design_full_rank}
+\title{Iteratively enforce full column rank on a design matrix}
+\usage{
+enforce_design_full_rank(
+  X,
+  C,
+  strategy = c("correlation", "variance", "response_correlation"),
+  response = NULL,
+  max_iterations = 300L,
+  corr_thresholds = seq(0.75, 0.5, by = -0.05),
+  verbose = FALSE
+)
+}
+\arguments{
+\item{X}{Numeric matrix with column names (the predictors subject to
+pruning).}
+
+\item{C}{Numeric matrix of covariates (can be unnamed) that will be kept.
+Pass \code{NULL} or a zero-column matrix when there are no covariates.}
+
+\item{strategy}{Passed through to \code{\link{drop_collinear_columns}}.}
+
+\item{response}{Passed through to \code{\link{drop_collinear_columns}}
+when \code{strategy = "response_correlation"}.}
+
+\item{max_iterations}{Integer. Hard cap on the iterative-prune loop.
+Default 300.}
+
+\item{corr_thresholds}{Numeric vector of |cor| thresholds used for the
+\code{\link{ld_prune_by_correlation}} fallback, tried in order.
+Default \code{seq(0.75, 0.5, by = -0.05)}.}
+
+\item{verbose}{Logical. If TRUE, print per-iteration progress. Default
+FALSE.}
+}
+\value{
+The pruned predictor matrix \code{X} (covariates \code{C} are not
+  modified).
+}
+\description{
+Given a candidate predictor matrix \code{X} and an optional unnamed
+covariate matrix \code{C}, builds the design \code{[1, X, C]} and removes
+rank-deficient columns from \code{X} until the design has full column rank.
+Rank-deficient columns are identified via the pivot of
+\code{qr([1, X, C])}. On each iteration, one problematic column is dropped
+using \code{\link{drop_collinear_columns}}. If iterative pruning does not
+achieve full rank, falls back to \code{\link{ld_prune_by_correlation}} at a
+descending sequence of correlation thresholds.
+}
+\examples{
+set.seed(1)
+X <- matrix(rnorm(100 * 4), 100, 4)
+X[, 4] <- X[, 1] + X[, 2]          # rank-deficient
+colnames(X) <- c("a", "b", "c", "d")
+C <- matrix(rnorm(100), 100, 1)
+X2 <- enforce_design_full_rank(X, C, strategy = "variance")
+qr(cbind(1, X2, C))$rank == ncol(cbind(1, X2, C))
+
+}
diff --git a/man/ld_clump_by_score.Rd b/man/ld_clump_by_score.Rd
new file mode 100644
index 00000000..de79f3cd
--- /dev/null
+++ b/man/ld_clump_by_score.Rd
@@ -0,0 +1,63 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/LD.R
+\name{ld_clump_by_score}
+\alias{ld_clump_by_score}
+\title{LD clumping by a per-variant score using bigsnpr}
+\usage{
+ld_clump_by_score(
+  X,
+  score,
+  chr,
+  pos,
+  r2 = 0.2,
+  window_kb = 100/r2,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{X}{Numeric matrix of 0/1/2 allele dosages, n rows by p variants.
+Column names are expected to be variant IDs but are not required.}
+
+\item{score}{Numeric vector of length \code{ncol(X)}. Higher values favour
+retention during clumping (e.g. -log10 p, |Z|, MAF). May be \code{NULL},
+in which case bigsnpr falls back to minor allele frequency computed from
+\code{X}.}
+
+\item{chr}{Integer or character vector of length \code{ncol(X)} giving the
+chromosome for each variant.}
+
+\item{pos}{Integer vector of length \code{ncol(X)} giving the base-pair
+position for each variant.}
+
+\item{r2}{Numeric in (0, 1]. r-squared threshold for clumping (variants
+within \code{window_kb} whose r2 exceeds \code{r2} and have lower
+\code{score} are removed). Default 0.2.}
+
+\item{window_kb}{Numeric. Window size in kilobases. Default is
+\code{100 / r2}, matching the common "ld-clump size = 100/r2" heuristic
+used in many GWAS pipelines.}
+
+\item{verbose}{Logical. If TRUE, print the number of retained variants.
+Default FALSE.}
+}
+\value{
+An integer vector of indices (into \code{X} columns) kept after
+  clumping. For a single-column \code{X}, returns \code{1L}.
+}
+\description{
+Wraps \code{bigsnpr::snp_clumping} with the boilerplate of wrapping a
+numeric dosage matrix into a \code{bigstatsr::FBM.code256} object and of
+handling the common pitfall of a single-variant input.
+}
+\examples{
+\dontrun{
+  set.seed(1)
+  n <- 500; p <- 20
+  X <- matrix(rbinom(n * p, 2, 0.3), n, p)
+  colnames(X) <- paste0("chr1:", seq_len(p) * 1000, ":A:G")
+  s <- runif(p)
+  chr <- rep(1L, p); pos <- seq_len(p) * 1000L
+  keep <- ld_clump_by_score(X, score = s, chr = chr, pos = pos, r2 = 0.2)
+}
+
+}
diff --git a/man/ld_prune_by_correlation.Rd b/man/ld_prune_by_correlation.Rd
new file mode 100644
index 00000000..700222c2
--- /dev/null
+++ b/man/ld_prune_by_correlation.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/LD.R
+\name{ld_prune_by_correlation}
+\alias{ld_prune_by_correlation}
+\title{Prune columns by pairwise correlation (LD-style prune)}
+\usage{
+ld_prune_by_correlation(X, cor_thres = 0.8, verbose = FALSE)
+}
+\arguments{
+\item{X}{Numeric matrix. Columns are the variables to prune (typically SNP
+genotype dosages); rows are observations.}
+
+\item{cor_thres}{Numeric in (0, 1). Absolute correlation threshold.
+Columns whose pairwise |cor| exceeds this are grouped; one survivor is
+kept per group. Default 0.8.}
+
+\item{verbose}{Logical. If TRUE, print progress messages. Default FALSE.}
+}
+\value{
+A list with:
+  \describe{
+    \item{X.new}{Matrix containing the retained columns of \code{X}.}
+    \item{filter.id}{Integer vector of the column indices of \code{X} that
+      were retained (in original order).}
+  }
+}
+\description{
+Performs single-linkage hierarchical clustering on a correlation-distance
+matrix (1 - |cor(X)|) and keeps one representative column per cluster at the
+given correlation threshold. Uses \code{Rfast::cora} when available for a
+faster correlation computation on wide matrices.
+}
+\examples{
+set.seed(1)
+X <- matrix(rnorm(100 * 5), 100, 5)
+X[, 2] <- X[, 1] + rnorm(100, sd = 0.01)   # near-duplicate of col 1
+res <- ld_prune_by_correlation(X, cor_thres = 0.9)
+ncol(res$X.new)
+
+}
diff --git a/pixi.toml b/pixi.toml
index 384c6b3c..5e92dfd2 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -48,6 +48,7 @@ r45 = {features = ["r45"]}
 "bioconductor-snpstats" = "*"
 "r-base" = "*"
 "r-bglr" = "*"
+"r-bigsnpr" = "*"
 "r-coda" = "*"
 "r-coloc" = "*"
 "r-colocboost" = "*"
diff --git a/tests/testthat/test_ld_prune.R b/tests/testthat/test_ld_prune.R
new file mode 100644
index 00000000..a1c90231
--- /dev/null
+++ b/tests/testthat/test_ld_prune.R
@@ -0,0 +1,272 @@
+library(testthat)
+
+# ===========================================================================
+# ld_prune_by_correlation
+# ===========================================================================
+
+test_that("ld_prune_by_correlation removes highly correlated columns", {
+  set.seed(42)
+  n <- 50; p <- 10
+  X <- matrix(rnorm(n * p), nrow = n)
+  colnames(X) <- paste0("v", 1:p)
+  X[, 2] <- X[, 1] + rnorm(n, sd = 0.01)
+  result <- ld_prune_by_correlation(X, cor_thres = 0.9)
+  expect_true(ncol(result$X.new) < p)
+  expect_equal(length(result$filter.id), ncol(result$X.new))
+})
+
+test_that("ld_prune_by_correlation keeps all columns when uncorrelated", {
+  set.seed(42)
+  n <- 100; p <- 5
+  X <- matrix(rnorm(n * p), nrow = n)
+  colnames(X) <- paste0("v", 1:p)
+  result <- ld_prune_by_correlation(X, cor_thres = 0.99)
+  expect_equal(ncol(result$X.new), p)
+  expect_equal(result$filter.id, 1:p)
+})
+
+test_that("ld_prune_by_correlation preserves colnames for single remaining column", {
+  set.seed(42)
+  n <- 50
+  X <- matrix(rnorm(n * 3), nrow = n)
+  colnames(X) <- c("a", "b", "c")
+  X[, 2] <- X[, 1] + rnorm(n, sd = 0.001)
+  X[, 3] <- X[, 1] + rnorm(n, sd = 0.001)
+  result <- ld_prune_by_correlation(X, cor_thres = 0.5)
+  expect_true(ncol(result$X.new) >= 1)
+  expect_true(!is.null(colnames(result$X.new)))
+})
+
+test_that("ld_prune_by_correlation errors on single-column input", {
+  set.seed(42)
+  n <- 30
+  X <- matrix(rnorm(n), nrow = n, ncol = 1)
+  colnames(X) <- "v1"
+  expect_error(ld_prune_by_correlation(X, cor_thres = 0.8))
+})
+
+test_that("ld_prune_by_correlation strict threshold removes at least as many as lenient", {
+  set.seed(42)
+  n <- 100; p <- 5
+  X <- matrix(rnorm(n * p), nrow = n)
+  colnames(X) <- paste0("v", 1:p)
+  X[, 2] <- X[, 1] + rnorm(n, sd = 0.1)
+  X[, 3] <- X[, 1] + rnorm(n, sd = 0.1)
+  X[, 5] <- X[, 4] + rnorm(n, sd = 0.1)
+  result_strict <- ld_prune_by_correlation(X, cor_thres = 0.3)
+  result_lenient <- ld_prune_by_correlation(X, cor_thres = 0.99)
+  expect_true(ncol(result_strict$X.new) <= ncol(result_lenient$X.new))
+})
+
+test_that("ld_prune_by_correlation preserves colnames when no columns deleted", {
+  set.seed(42)
+  n <- 100; p <- 3
+  X <- matrix(rnorm(n * p), nrow = n)
+  colnames(X) <- c("snp_a", "snp_b", "snp_c")
+  result <- ld_prune_by_correlation(X, cor_thres = 0.999)
+  expect_equal(colnames(result$X.new), colnames(X))
+})
+
+test_that("ld_prune_by_correlation is silent by default, chatty with verbose", {
+  set.seed(1)
+  X <- matrix(rnorm(100), 20, 5)
+  colnames(X) <- paste0("v", 1:5)
+  X[, 2] <- X[, 1] + rnorm(20, sd = 1e-3)
+  expect_silent(ld_prune_by_correlation(X, cor_thres = 0.9))
+  expect_message(ld_prune_by_correlation(X, cor_thres = 0.9, verbose = TRUE),
+                 "ld_prune_by_correlation")
+})
+
+# ===========================================================================
+# drop_collinear_columns
+# ===========================================================================
+
+# drop_collinear_columns and enforce_design_full_rank are unexported helpers;
+# access them via pecotmr::: in these tests.
+
+test_that("drop_collinear_columns returns X unchanged when problematic_cols is empty", {
+  X <- matrix(rnorm(100), nrow = 20, ncol = 5)
+  colnames(X) <- paste0("v", 1:5)
+  result <- pecotmr:::drop_collinear_columns(X, problematic_cols = character(0), strategy = "correlation")
+  expect_equal(ncol(result), 5)
+})
+
+test_that("drop_collinear_columns removes single problematic column", {
+  X <- matrix(rnorm(100), nrow = 20, ncol = 5)
+  colnames(X) <- paste0("v", 1:5)
+  result <- pecotmr:::drop_collinear_columns(X, problematic_cols = "v3", strategy = "correlation")
+  expect_equal(ncol(result), 4)
+  expect_false("v3" %in% colnames(result))
+})
+
+test_that("drop_collinear_columns variance strategy removes lowest variance column", {
+  set.seed(42)
+  n <- 50
+  X <- matrix(rnorm(n * 3), nrow = n, ncol = 3)
+  colnames(X) <- c("low_var", "mid_var", "high_var")
+  X[, 1] <- X[, 1] * 0.01
+  X[, 3] <- X[, 3] * 10
+  result <- pecotmr:::drop_collinear_columns(X, problematic_cols = c("low_var", "mid_var", "high_var"),
+                                             strategy = "variance")
+  expect_equal(ncol(result), 2)
+  expect_false("low_var" %in% colnames(result))
+})
+
+test_that("drop_collinear_columns correlation strategy with two columns removes one", {
+  set.seed(42)
+  X <- matrix(rnorm(100), nrow = 20, ncol = 5)
+  colnames(X) <- paste0("v", 1:5)
+  result <- pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2"), strategy = "correlation")
+  expect_equal(ncol(result), 4)
+})
+
+test_that("drop_collinear_columns correlation strategy with 3+ cols removes highest sum", {
+  set.seed(42)
+  n <- 50
+  X <- matrix(rnorm(n * 4), nrow = n, ncol = 4)
+  colnames(X) <- paste0("v", 1:4)
+  X[, 2] <- X[, 1] + rnorm(n, sd = 0.01)
+  X[, 3] <- X[, 1] + rnorm(n, sd = 0.01)
+  result <- pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2", "v3"),
+                                             strategy = "correlation")
+  expect_equal(ncol(result), 3)
+})
+
+test_that("drop_collinear_columns response_correlation strategy removes lowest |cor| with response", {
+  set.seed(42)
+  n <- 50
+  X <- matrix(rnorm(n * 3), nrow = n, ncol = 3)
+  colnames(X) <- c("v1", "v2", "v3")
+  response <- X[, 1] * 2 + rnorm(n, sd = 0.1)
+  result <- pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2", "v3"),
+                                             strategy = "response_correlation",
+                                             response = response)
+  expect_equal(ncol(result), 2)
+  expect_true("v1" %in% colnames(result))
+})
+
+test_that("drop_collinear_columns errors on response_correlation without response", {
+  X <- matrix(rnorm(60), 20, 3)
+  colnames(X) <- c("v1", "v2", "v3")
+  expect_error(
+    pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2"),
+                                     strategy = "response_correlation"),
+    "response"
+  )
+})
+
+test_that("drop_collinear_columns errors on invalid strategy", {
+  X <- matrix(rnorm(100), nrow = 20, ncol = 5)
+  colnames(X) <- paste0("v", 1:5)
+  expect_error(
+    pecotmr:::drop_collinear_columns(X, problematic_cols = c("v1", "v2"),
+                                     strategy = "invalid_strategy"),
+    "arg"
+  )
+})
+
+test_that("drop_collinear_columns preserves column name when single column remains", {
+  set.seed(42)
+  X <- matrix(rnorm(40), nrow = 20, ncol = 2)
+  colnames(X) <- c("keeper", "removed")
+  result <- pecotmr:::drop_collinear_columns(X, problematic_cols = "removed", strategy = "correlation")
+  expect_equal(ncol(result), 1)
+  expect_equal(colnames(result), "keeper")
+})
+
+test_that("drop_collinear_columns is silent by default", {
+  X <- matrix(rnorm(40), 20, 2)
+  colnames(X) <- c("a", "b")
+  expect_silent(pecotmr:::drop_collinear_columns(X, problematic_cols = "b", strategy = "correlation"))
+})
+
+# ===========================================================================
+# enforce_design_full_rank (unexported)
+# ===========================================================================
+
+test_that("enforce_design_full_rank returns full-rank matrix when already full rank", {
+  set.seed(42)
+  n <- 50
+  X <- matrix(rnorm(n * 3), nrow = n, ncol = 3)
+  colnames(X) <- paste0("v", 1:3)
+  C <- matrix(rnorm(n * 2), nrow = n, ncol = 2)
+  result <- enforce_design_full_rank(X = X, C = C, strategy = "correlation")
+  expect_true(is.matrix(result))
+  expect_true(ncol(result) >= 1)
+})
+
+test_that("enforce_design_full_rank handles rank-deficient design via correlation fallback", {
+  set.seed(42)
+  n <- 50
+  X <- matrix(rnorm(n * 4), nrow = n, ncol = 4)
+  colnames(X) <- paste0("v", 1:4)
+  X[, 4] <- X[, 1] + X[, 2]
+  result <- enforce_design_full_rank(X = X, C = NULL, strategy = "correlation")
+  design <- cbind(1, result)
+  expect_equal(qr(design)$rank, ncol(design))
+})
+
+test_that("enforce_design_full_rank preserves colname for single-column input", {
+  set.seed(42)
+  n <- 50
+  X <- matrix(rnorm(n), nrow = n, ncol = 1)
+  colnames(X) <- "only_snp"
+  result <- enforce_design_full_rank(X = X, C = NULL, strategy = "correlation")
+  expect_equal(colnames(result), "only_snp")
+})
+
+test_that("enforce_design_full_rank is silent by default", {
+  set.seed(42)
+  n <- 50
+  X <- matrix(rnorm(n * 3), n, 3)
+  colnames(X) <- paste0("v", 1:3)
+  expect_silent(enforce_design_full_rank(X = X, C = NULL, strategy = "correlation"))
+})
+
+# ===========================================================================
+# ld_clump_by_score
+# ===========================================================================
+
+test_that("ld_clump_by_score skips clumping on single-column input", {
+  skip_if_not_installed("bigsnpr")
+  skip_if_not_installed("bigstatsr")
+  set.seed(1)
+  X <- matrix(rbinom(100, 2, 0.3), 100, 1)
+  colnames(X) <- "chr1:100:A:G"
+  keep <- ld_clump_by_score(X, score = 1.0, chr = 1L, pos = 100L, r2 = 0.2)
+  expect_equal(keep, 1L)
+})
+
+test_that("ld_clump_by_score validates input lengths", {
+  skip_if_not_installed("bigsnpr")
+  skip_if_not_installed("bigstatsr")
+  set.seed(1)
+  X <- matrix(rbinom(100 * 3, 2, 0.3), 100, 3)
+  colnames(X) <- paste0("chr1:", seq_len(3) * 1000, ":A:G")
+  expect_error(
+    ld_clump_by_score(X, score = 1:2, chr = rep(1L, 3), pos = seq_len(3) * 1000L),
+    "score"
+  )
+  expect_error(
+    ld_clump_by_score(X, score = 1:3, chr = rep(1L, 2), pos = seq_len(3) * 1000L),
+    "chr and pos"
+  )
+})
+
+test_that("ld_clump_by_score returns indices on real data", {
+  skip_if_not_installed("bigsnpr")
+  skip_if_not_installed("bigstatsr")
+  set.seed(1)
+  n <- 500; p <- 20
+  X <- matrix(rbinom(n * p, 2, 0.3), n, p)
+  colnames(X) <- paste0("chr1:", seq_len(p) * 1000, ":A:G")
+  # Introduce perfect LD between variants 1 and 2
+  X[, 2] <- X[, 1]
+  score <- c(2, 1, runif(p - 2))
+  chr <- rep(1L, p)
+  pos <- seq_len(p) * 1000L
+  keep <- ld_clump_by_score(X, score = score, chr = chr, pos = pos, r2 = 0.2)
+  expect_true(1L %in% keep)
+  expect_false(2L %in% keep)   # pruned: same as variant 1 but lower score
+  expect_true(length(keep) < p)
+})
diff --git a/tests/testthat/test_twas_pipeline_helpers.R b/tests/testthat/test_twas_pipeline_helpers.R
new file mode 100644
index 00000000..5f01bfa5
--- /dev/null
+++ b/tests/testthat/test_twas_pipeline_helpers.R
@@ -0,0 +1,58 @@
+library(testthat)
+
+# build_twas_score_row is an unexported contract helper shared between
+# twas_pipeline (pecotmr) and quantile_twas_pipeline (qQTLR). Tests cover the
+# shapes returned by twas_analysis() so both pipelines stay aligned.
+
+test_that("build_twas_score_row returns empty data.frame for NULL input", {
+  out <- pecotmr:::build_twas_score_row(NULL,
+                                        weight_db = "ENSG001",
+                                        context   = "Cortex",
+                                        study     = "AD")
+  expect_s3_class(out, "data.frame")
+  expect_equal(nrow(out), 0)
+})
+
+test_that("build_twas_score_row packs a single-method twas_rs", {
+  # Shape mimics twas_analysis() output: apply(weights, 2, twas_z) returns
+  # a named list where each element is list(z=, pval=). The name's final
+  # "_<suffix>" is the method marker (see the sub() regex).
+  # find_data(twas_rs, c(2, "z")) descends one level and pulls out "z".
+  twas_rs <- list(
+    enet_weights = list(z = 2.5, pval = 0.012)
+  )
+  out <- pecotmr:::build_twas_score_row(twas_rs,
+                                        weight_db = "ENSG001",
+                                        context   = "Cortex",
+                                        study     = "AD")
+  expect_s3_class(out, "data.frame")
+  expect_equal(nrow(out), 1)
+  expect_setequal(colnames(out),
+                  c("gwas_study", "method", "twas_z", "twas_pval",
+                    "context", "molecular_id"))
+  expect_equal(out$gwas_study,   "AD")
+  expect_equal(out$method,       "enet")
+  expect_equal(out$twas_z,       2.5)
+  expect_equal(out$twas_pval,    0.012)
+  expect_equal(out$context,      "Cortex")
+  expect_equal(out$molecular_id, "ENSG001")
+})
+
+test_that("build_twas_score_row packs a multi-method twas_rs", {
+  twas_rs <- list(
+    enet_weights  = list(z =  2.5, pval = 0.012),
+    lasso_weights = list(z = -1.8, pval = 0.072),
+    top_weights   = list(z =  0.4, pval = 0.689)
+  )
+  out <- pecotmr:::build_twas_score_row(twas_rs,
+                                        weight_db = "ENSG002",
+                                        context   = "Liver",
+                                        study     = "T2D")
+  expect_equal(nrow(out), 3)
+  expect_equal(out$method,       c("enet", "lasso", "top"))
+  expect_equal(out$twas_z,       c(2.5, -1.8, 0.4))
+  expect_equal(out$twas_pval,    c(0.012, 0.072, 0.689))
+  expect_true(all(out$gwas_study   == "T2D"))
+  expect_true(all(out$context      == "Liver"))
+  expect_true(all(out$molecular_id == "ENSG002"))
+})