From 6bf86840f2c582353a38be3175ff021e8e2aa0ae Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 7 May 2026 06:28:33 -0400 Subject: [PATCH 1/4] Add sldsc wrapper --- DESCRIPTION | 2 +- R/sldsc_wrapper.R | 326 +++++++++++++++++++++++++++++++++++++++ tests/testthat/test_LD.R | 58 +++++++ 3 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 R/sldsc_wrapper.R diff --git a/DESCRIPTION b/DESCRIPTION index ca1a7691..035eb535 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -68,4 +68,4 @@ LinkingTo: RcppArmadillo NeedsCompilation: yes VignetteBuilder: knitr -RoxygenNote: 7.3.3 +Config/roxygen2/version: 8.0.0 diff --git a/R/sldsc_wrapper.R b/R/sldsc_wrapper.R new file mode 100644 index 00000000..d33ff165 --- /dev/null +++ b/R/sldsc_wrapper.R @@ -0,0 +1,326 @@ +# Stratified LD Score Regression (S-LDSC) post-processing wrappers around polyfun. +# +# This file provides the post-processing layer for the xqtl-protocol sLDSC pipeline: +# read polyfun outputs per trait, compute Gazal-style standardized tau* and +# differential per-SNP heritability (EnrichStat), and run DerSimonian-Laird +# random-effects meta-analysis across traits. +# +# Reference panel convention: all LD-derived quantities (baseline LD scores, +# target LD scores, regression weights, allele frequencies) must come from the +# same reference panel. Do not mix files from different panels (e.g. 1000G vs ADSP). +# +# MAF convention: by default we restrict to MAF > 5% per the sLDSC recommendation. +# Pass maf_cutoff = 0 to opt out (not recommended). +# +# Cross-type comparison: tau* (Gazal et al. 2017 standardization) is the +# cross-type comparable statistic. Use tau* to rank or meta-analyze annotations +# that mix binary and continuous types. E (proportion-based enrichment) is +# scale-dependent for continuous annotations and is only comparable within type. + +#' @title Read S-LDSC outputs from polyfun for one trait/run +#' +#' @description Reads the regression outputs produced by `polyfun/ldsc.py` for a +#' single polyfun run (one trait, one annotation set) and returns them as a +#' tidy list ready for downstream standardization. Hides the underlying file +#' formats; downstream code consumes only modeling quantities. +#' +#' @param prefix Character. Path prefix to the polyfun outputs for one trait/run. +#' The function appends `.results`, `.log`, and `.part_delete` to this prefix. +#' Example: `"/path/to/cwd/CAD_META.filtered.sumstats.gz"`. +#' +#' @return A named list with components: +#' \describe{ +#' \item{categories}{Character vector of annotation names, in regression order +#' (target annotations first, baseline last by convention).} +#' \item{tau}{Numeric named vector of regression coefficients \eqn{\tau_C}.} +#' \item{tau_se}{Numeric named vector of standard errors of \eqn{\tau_C}.} +#' \item{enrichment}{Numeric named vector of \eqn{E_C = \pi^{h^2}_C / \pi^M_C}.} +#' \item{enrichment_se}{Numeric named vector of standard errors of \eqn{E_C}.} +#' \item{enrichment_p}{Numeric named vector of p-values for the differential +#' per-SNP heritability test (the EnrichStat p-value reported by polyfun).} +#' \item{prop_h2}{Numeric named vector \eqn{\pi^{h^2}_C}.} +#' \item{prop_snps}{Numeric named vector \eqn{\pi^M_C}.} +#' \item{h2g}{Scalar total trait heritability \eqn{h^2_g}.} +#' \item{tau_blocks}{Numeric matrix of per-block jackknife \eqn{\tau} values, +#' dimensions (n_blocks x n_annotations), columns named by category.} +#' \item{n_blocks}{Integer number of jackknife blocks (typically 200).} +#' } +#' +#' @examples +#' \dontrun{ +#' run <- read_sldsc_trait("/output/CAD_META.filtered.sumstats.gz") +#' run$tau["my_target_annotation"] +#' } +#' +#' @export +read_sldsc_trait <- function(prefix) { + stop("Not yet implemented (skeleton).") +} + + +#' @title Compute per-annotation standard deviation, MAF-restricted +#' +#' @description Computes the standard deviation of each annotation column in the +#' target annotation files, restricted to SNPs above a MAF cutoff. The MAF +#' restriction is required for the standardization to be internally consistent +#' with polyfun's regression, which operates on MAF > cutoff SNPs by default. +#' +#' @param target_anno_dir Character. Directory containing target annotation files +#' (one per chromosome) in polyfun's `.annot.gz` format. +#' @param frqfile_dir Character or NULL. Directory containing PLINK `.frq` files +#' for the reference panel. Required when `maf_cutoff > 0`; the function +#' errors if missing. +#' @param plink_name Character. Filename prefix of the `.frq` files +#' (e.g. `"ADSP_chr"`). Files are expected at `{plink_name}{chr}.frq`. +#' @param maf_cutoff Numeric, default `0.05`. Only SNPs with `MAF > maf_cutoff` +#' contribute to sd. Set to `0` to disable filtering (must match the +#' `--not-M-5-50` regression mode for internal consistency). +#' @param annot_cols Character or integer vector, default NULL. Annotation columns +#' to compute sd for. If NULL, all annotation columns (i.e. all columns past +#' the standard fixed columns CHR/SNP/BP/CM/A1/A2/MAF) are used. +#' +#' @return Named numeric vector of \eqn{sd_C} values, one per annotation. +#' +#' @examples +#' \dontrun{ +#' sd_annot <- compute_sldsc_annot_sd( +#' target_anno_dir = "/output/ldscores/AC_DeJager_eQTL", +#' frqfile_dir = "/ref/ADSP/frq", +#' plink_name = "ADSP_chr", +#' maf_cutoff = 0.05 +#' ) +#' } +#' +#' @export +compute_sldsc_annot_sd <- function(target_anno_dir, frqfile_dir = NULL, + plink_name = "ADSP_chr", + maf_cutoff = 0.05, annot_cols = NULL) { + stop("Not yet implemented (skeleton).") +} + + +#' @title Reference-panel SNP count at a given MAF cutoff +#' +#' @description Returns the number of SNPs in the reference panel above the MAF +#' cutoff, matching the regression's M_5_50 (when `maf_cutoff > 0`) or M +#' (when `maf_cutoff == 0`). Used as \eqn{M_{ref}} in the standardization. +#' +#' @param target_anno_dir Character. Directory containing per-chromosome +#' `.l2.M_5_50` (when MAF-filtered) or `.l2.M` (when not) files produced by +#' polyfun's `compute_ldscores.py` for the target annotation. +#' @param maf_cutoff Numeric, default `0.05`. +#' +#' @return Scalar integer: total number of SNPs in the reference panel above the cutoff. +#' +#' @export +compute_sldsc_M_ref <- function(target_anno_dir, maf_cutoff = 0.05) { + stop("Not yet implemented (skeleton).") +} + + +#' @title Detect whether each annotation is binary or continuous +#' +#' @description Inspects each annotation column in the target annotation files +#' and returns whether its values lie in \{0, 1\} (binary) or take other values +#' (continuous). Used to select the appropriate within-type headline statistic. +#' For cross-type comparison, always use \eqn{\tau^*_C} regardless of the flag. +#' +#' @param target_anno_dir Character. Directory containing the target `.annot.gz` +#' files (one per chromosome). +#' @param annot_cols Character or integer vector, default NULL. +#' +#' @return Named logical vector: TRUE for binary, FALSE for continuous. +#' +#' @export +is_binary_sldsc_annot <- function(target_anno_dir, annot_cols = NULL) { + stop("Not yet implemented (skeleton).") +} + + +#' @title Standardize tau and compute per-block tau* and EnrichStat for one polyfun run +#' +#' @description Given the polyfun read result plus the annotation sd's and +#' reference SNP count, computes the Gazal-style standardized tau (\eqn{\tau^*}) +#' and the differential per-SNP heritability statistic (EnrichStat) for the +#' target annotations, including their per-block jackknife values for use in +#' cross-trait meta-analysis. +#' +#' @details The standardization is +#' \deqn{\tau^*_C = \tau_C \cdot sd_C \cdot M_{ref} / h^2_g} +#' applied both to the point estimate and to each of the n_blocks jackknife +#' blocks. The EnrichStat is +#' \deqn{\frac{h^2_g}{M_{ref}}\left[\frac{\pi^{h^2}_C}{\pi^M_C} +#' - \frac{1-\pi^{h^2}_C}{1-\pi^M_C}\right]} +#' computed from the per-block tau values, with jackknife SE from the per-block +#' variance \eqn{\sqrt{\frac{(B-1)^2}{B} Var_b}}. +#' +#' Only target annotations are returned; baseline annotations are dropped after +#' serving their role of conditioning the regression. +#' +#' @param trait_data List. Output of \code{\link{read_sldsc_trait}} for one polyfun run. +#' @param sd_annot Named numeric vector. Output of +#' \code{\link{compute_sldsc_annot_sd}}, restricted to target annotations. +#' @param M_ref Scalar. Output of \code{\link{compute_sldsc_M_ref}}. +#' @param target_categories Character vector or NULL. Annotation names to keep. +#' If NULL, all categories with names matching `names(sd_annot)` are kept. +#' +#' @return A list with components: +#' \describe{ +#' \item{summary}{Data frame with one row per target annotation and columns: +#' `target`, `tau`, `tau_se`, `tau_star`, `tau_star_se`, `enrichment`, +#' `enrichment_se`, `enrichment_p`, `enrichstat`, `enrichstat_se`.} +#' \item{tau_star_blocks}{Matrix (n_blocks x n_target) of per-block \eqn{\tau^*_C}.} +#' \item{enrichstat_blocks}{Matrix (n_blocks x n_target) of per-block EnrichStat.} +#' \item{h2g}{Scalar trait heritability.} +#' \item{n_blocks}{Integer.} +#' } +#' +#' @export +standardize_sldsc_trait <- function(trait_data, sd_annot, M_ref, + target_categories = NULL) { + stop("Not yet implemented (skeleton).") +} + + +#' @title Random-effects meta-analysis of S-LDSC quantities across traits +#' +#' @description DerSimonian-Laird random-effects meta-analysis of one S-LDSC +#' quantity across multiple traits, for one annotation. Used as the entry +#' point both for the default pipeline meta and for user-driven re-meta over +#' subsets of traits. +#' +#' @details Implements +#' \deqn{\hat\theta_{meta} = \sum_i w_i \hat\theta_i / \sum_i w_i, +#' \quad SE_{meta} = 1/\sqrt{\sum_i w_i}, +#' \quad w_i = 1/(SE_i^2 + \hat\sigma^2)} +#' via `rmeta::meta.summaries(..., method = "random")`. The two-sided p-value +#' is computed from the meta z-score under the standard normal. +#' +#' `quantity = "tau_star"` and `"enrichstat"` use the jackknife SE from +#' per-block delete values (computed inside +#' \code{\link{standardize_sldsc_trait}}). `quantity = "enrichment"` uses +#' the SE reported directly by the regression engine. +#' +#' @param per_trait_estimates Named list of standardized per-trait results from +#' \code{\link{standardize_sldsc_trait}}. Pass a subset to re-meta on a +#' user-chosen group of traits. +#' @param category Character. Annotation name to meta-analyze. +#' @param quantity Character, one of `"tau_star"`, `"enrichment"`, or +#' `"enrichstat"`. +#' +#' @return A list with components: `mean`, `se`, `p`, `n_traits`, +#' `traits_used`, and `tau2` (the DerSimonian-Laird between-trait variance). +#' +#' @examples +#' \dontrun{ +#' meta_all <- meta_sldsc_random(per_trait_results, "my_anno", "tau_star") +#' +#' # subset to a custom trait group +#' subset <- per_trait_results[c("CAD_META", "AD_GWAX", "PD_meta")] +#' meta_neuro <- meta_sldsc_random(subset, "my_anno", "tau_star") +#' } +#' +#' @export +meta_sldsc_random <- function(per_trait_estimates, category, + quantity = c("tau_star", "enrichment", "enrichstat")) { + stop("Not yet implemented (skeleton).") +} + + +#' @title End-to-end S-LDSC post-processing across traits, single + joint in one pass +#' +#' @description Top-level convenience wrapper. Reads polyfun outputs (single-tau +#' runs and the joint-tau run) for each trait, standardizes both modes, and +#' runs the default random-effects meta-analysis across all traits supplied. +#' Single and joint analyses are produced in one pass; no `joint_tau` flag. +#' +#' @details For a set of \eqn{N} target annotations and one trait, the caller is +#' expected to have already produced \eqn{N+1} polyfun runs: \eqn{N} single-tau +#' runs (each fitting one target + baseline) and 1 joint-tau run (fitting all +#' \eqn{N} targets + baseline together). The pipeline reads all of these, drops +#' baseline categories, standardizes both modes, and assembles consolidated +#' per-trait and meta data frames. +#' +#' Reports baseline annotation count and names via `message()` once at the +#' start of the run for transparency. +#' +#' Cross-type comparison: the `meta$tau_star` frame is the apple-to-apple table +#' for ranking annotations regardless of binary/continuous type. The `is_binary` +#' column lets callers filter further when within-type comparison is desired. +#' +#' @param trait_single_prefixes Named list. For each trait (name = trait id), +#' a character vector of length \eqn{N} giving the polyfun output prefixes for +#' the \eqn{N} single-tau runs (one per target annotation). Order must match +#' `target_categories`. +#' @param trait_joint_prefix Named character. For each trait (name = trait id, +#' matching `trait_single_prefixes`), the polyfun output prefix for the joint-tau +#' run that fits all \eqn{N} targets together. +#' @param target_anno_dir Character. Directory of the target annotation files +#' (used for `sd_C` and binary detection). When all \eqn{N} targets share +#' one directory (multi-column annot files), pass that. When each target has +#' its own directory, pass a named character vector (names = target categories). +#' @param frqfile_dir Character or NULL. Directory of `.frq` files for the panel. +#' @param plink_name Character. Filename prefix of `.frq` files (default `"ADSP_chr"`). +#' @param maf_cutoff Numeric, default `0.05`. +#' @param target_categories Character vector or NULL. Target annotation names to +#' report. If NULL, auto-detected from the joint-run results as all categories +#' not in baseline. +#' +#' @return A list with components: +#' \describe{ +#' \item{per_trait}{Named list of per-trait results. For each trait, a list with: +#' `summary` (wide data frame of single + joint estimates side-by-side per +#' target annotation, with `is_binary` flag), `tau_star_blocks_single`, +#' `tau_star_blocks_joint`, `enrichstat_blocks_single`, +#' `enrichstat_blocks_joint` (all matrices), and `h2g`.} +#' \item{meta}{List of three data frames, each with one row per target annotation: +#' \itemize{ +#' \item `tau_star`: columns `target`, `is_binary`, `single_mean`, +#' `single_se`, `single_p`, `joint_mean`, `joint_se`, `joint_p`, `n_traits`. +#' \item `enrichment`: same shape. +#' \item `enrichstat`: same shape. +#' } +#' The `tau_star` frame is the cross-type comparable headline.} +#' \item{params}{List echoing `maf_cutoff`, `M_ref`, `target_categories`, +#' `n_baseline`, `baseline_categories`, and `trait_names` for reproducibility.} +#' } +#' +#' @examples +#' \dontrun{ +#' res <- sldsc_postprocessing_pipeline( +#' trait_single_prefixes = list( +#' CAD_META = c("/output/CAD_META.single_anno1", "/output/CAD_META.single_anno2"), +#' AD_GWAX = c("/output/AD_GWAX.single_anno1", "/output/AD_GWAX.single_anno2") +#' ), +#' trait_joint_prefix = c( +#' CAD_META = "/output/CAD_META.joint", +#' AD_GWAX = "/output/AD_GWAX.joint" +#' ), +#' target_anno_dir = "/output/ldscores/my_targets", +#' frqfile_dir = "/ref/ADSP/frq", +#' plink_name = "ADSP_chr", +#' maf_cutoff = 0.05, +#' target_categories = c("anno1", "anno2") +#' ) +#' res$meta$tau_star # cross-type comparable headline +#' res$meta$enrichment # within-binary headline +#' +#' # later, re-meta on a custom subset of traits +#' meta_neuro <- meta_sldsc_random( +#' res$per_trait[c("AD_GWAX", "PD_meta")], "anno1", "tau_star" +#' ) +#' } +#' +#' @seealso \code{\link{read_sldsc_trait}}, \code{\link{standardize_sldsc_trait}}, +#' \code{\link{meta_sldsc_random}} +#' +#' @export +sldsc_postprocessing_pipeline <- function(trait_single_prefixes, + trait_joint_prefix, + target_anno_dir, + frqfile_dir = NULL, + plink_name = "ADSP_chr", + maf_cutoff = 0.05, + target_categories = NULL) { + stop("Not yet implemented (skeleton).") +} diff --git a/tests/testthat/test_LD.R b/tests/testthat/test_LD.R index ea6a8d7b..058c6460 100644 --- a/tests/testthat/test_LD.R +++ b/tests/testthat/test_LD.R @@ -823,3 +823,61 @@ test_that("can_merge checks chromosome and size", { b3 <- data.frame(chrom = "2", size = 100) expect_false(pecotmr:::can_merge(b1, b3, max_size = 500)) }) + +# ---- check_ld ---- +test_that("check_ld detects positive definite matrix", { + R <- diag(5) + result <- check_ld(R, method = "check") + expect_true(result$is_pd) + expect_true(result$is_psd) + expect_equal(result$n_negative, 0) + expect_equal(result$min_eigenvalue, 1) + expect_equal(result$method_applied, "none") +}) + +test_that("check_ld detects non-PSD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + result <- check_ld(R, method = "check") + expect_false(result$is_psd) + expect_true(result$n_negative > 0) + expect_true(result$min_eigenvalue < 0) + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) +}) + +test_that("check_ld eigenfix repairs non-PSD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + result <- check_ld(R, method = "eigenfix") + expect_equal(result$method_applied, "eigenfix") + result2 <- check_ld(result$R, method = "check") + expect_true(result2$is_psd) + expect_equal(diag(result$R), rep(1, 3)) + expect_true(isSymmetric(result$R)) +}) + +test_that("check_ld shrink repairs non-PD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + result <- check_ld(R, method = "shrink", shrinkage = 0.1) + expect_equal(result$method_applied, "shrink") + result2 <- check_ld(result$R, method = "check") + expect_true(result2$is_pd) +}) + +test_that("check_ld eigenfix is no-op on PSD matrix", { + R <- diag(5) + result <- check_ld(R, method = "eigenfix") + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) +}) + +test_that("check_ld condition_number is correct", { + R <- diag(c(1, 0.5, 0.1)) + result <- check_ld(R, method = "check") + expect_equal(result$condition_number, 10) +}) From 4d0b7afbb59a2e5162a566b1462ee9409f401429 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 7 May 2026 07:25:03 -0400 Subject: [PATCH 2/4] documentation update --- R/sldsc_wrapper.R | 175 +++++++++++++++++++++++++--------------------- 1 file changed, 96 insertions(+), 79 deletions(-) diff --git a/R/sldsc_wrapper.R b/R/sldsc_wrapper.R index d33ff165..62d96943 100644 --- a/R/sldsc_wrapper.R +++ b/R/sldsc_wrapper.R @@ -1,8 +1,8 @@ # Stratified LD Score Regression (S-LDSC) post-processing wrappers around polyfun. # # This file provides the post-processing layer for the xqtl-protocol sLDSC pipeline: -# read polyfun outputs per trait, compute Gazal-style standardized tau* and -# differential per-SNP heritability (EnrichStat), and run DerSimonian-Laird +# read polyfun outputs per trait, compute Gazal-style standardized tau* and the +# differential per-SNP heritability statistic (EnrichStat), and run DerSimonian-Laird # random-effects meta-analysis across traits. # # Reference panel convention: all LD-derived quantities (baseline LD scores, @@ -30,8 +30,8 @@ #' #' @return A named list with components: #' \describe{ -#' \item{categories}{Character vector of annotation names, in regression order -#' (target annotations first, baseline last by convention).} +#' \item{categories}{Character vector of annotation names in regression order +#' (target annotations first, baseline last by polyfun convention).} #' \item{tau}{Numeric named vector of regression coefficients \eqn{\tau_C}.} #' \item{tau_se}{Numeric named vector of standard errors of \eqn{\tau_C}.} #' \item{enrichment}{Numeric named vector of \eqn{E_C = \pi^{h^2}_C / \pi^M_C}.} @@ -40,7 +40,7 @@ #' per-SNP heritability test (the EnrichStat p-value reported by polyfun).} #' \item{prop_h2}{Numeric named vector \eqn{\pi^{h^2}_C}.} #' \item{prop_snps}{Numeric named vector \eqn{\pi^M_C}.} -#' \item{h2g}{Scalar total trait heritability \eqn{h^2_g}.} +#' \item{h2g}{Scalar total trait heritability \eqn{h^2_g}, parsed from the log.} #' \item{tau_blocks}{Numeric matrix of per-block jackknife \eqn{\tau} values, #' dimensions (n_blocks x n_annotations), columns named by category.} #' \item{n_blocks}{Integer number of jackknife blocks (typically 200).} @@ -65,6 +65,12 @@ read_sldsc_trait <- function(prefix) { #' restriction is required for the standardization to be internally consistent #' with polyfun's regression, which operates on MAF > cutoff SNPs by default. #' +#' @details Implementation walks per-chromosome `.annot.gz` files, joins them +#' with PLINK `.frq` files when `maf_cutoff > 0`, and pools across chromosomes +#' with the standard pooled-variance formula +#' \deqn{sd_C = \sqrt{\sum_{\text{chr}} (n_{\text{chr}} - 1)\,\mathrm{Var}(C_{\text{chr}}) \,/\, \sum_{\text{chr}} (n_{\text{chr}} - 1)}} +#' over the MAF-restricted SNPs only. +#' #' @param target_anno_dir Character. Directory containing target annotation files #' (one per chromosome) in polyfun's `.annot.gz` format. #' @param frqfile_dir Character or NULL. Directory containing PLINK `.frq` files @@ -73,24 +79,14 @@ read_sldsc_trait <- function(prefix) { #' @param plink_name Character. Filename prefix of the `.frq` files #' (e.g. `"ADSP_chr"`). Files are expected at `{plink_name}{chr}.frq`. #' @param maf_cutoff Numeric, default `0.05`. Only SNPs with `MAF > maf_cutoff` -#' contribute to sd. Set to `0` to disable filtering (must match the -#' `--not-M-5-50` regression mode for internal consistency). +#' contribute to sd. Set to `0` to disable (must match the regression's +#' `--not-M-5-50` mode). #' @param annot_cols Character or integer vector, default NULL. Annotation columns #' to compute sd for. If NULL, all annotation columns (i.e. all columns past #' the standard fixed columns CHR/SNP/BP/CM/A1/A2/MAF) are used. #' #' @return Named numeric vector of \eqn{sd_C} values, one per annotation. #' -#' @examples -#' \dontrun{ -#' sd_annot <- compute_sldsc_annot_sd( -#' target_anno_dir = "/output/ldscores/AC_DeJager_eQTL", -#' frqfile_dir = "/ref/ADSP/frq", -#' plink_name = "ADSP_chr", -#' maf_cutoff = 0.05 -#' ) -#' } -#' #' @export compute_sldsc_annot_sd <- function(target_anno_dir, frqfile_dir = NULL, plink_name = "ADSP_chr", @@ -102,12 +98,14 @@ compute_sldsc_annot_sd <- function(target_anno_dir, frqfile_dir = NULL, #' @title Reference-panel SNP count at a given MAF cutoff #' #' @description Returns the number of SNPs in the reference panel above the MAF -#' cutoff, matching the regression's M_5_50 (when `maf_cutoff > 0`) or M -#' (when `maf_cutoff == 0`). Used as \eqn{M_{ref}} in the standardization. +#' cutoff, used as \eqn{M_{ref}} in the standardization. When +#' `maf_cutoff > 0` the function sums `.l2.M_5_50` files (matching polyfun's +#' regression which uses M_5_50 with `--frqfile-chr`); when `maf_cutoff == 0` +#' it sums `.l2.M` files (matching the `--not-M-5-50` regression mode). #' #' @param target_anno_dir Character. Directory containing per-chromosome #' `.l2.M_5_50` (when MAF-filtered) or `.l2.M` (when not) files produced by -#' polyfun's `compute_ldscores.py` for the target annotation. +#' polyfun's `compute_ldscores.py`. #' @param maf_cutoff Numeric, default `0.05`. #' #' @return Scalar integer: total number of SNPs in the reference panel above the cutoff. @@ -137,47 +135,57 @@ is_binary_sldsc_annot <- function(target_anno_dir, annot_cols = NULL) { } -#' @title Standardize tau and compute per-block tau* and EnrichStat for one polyfun run +#' @title Standardize tau and compute EnrichStat for one polyfun run #' #' @description Given the polyfun read result plus the annotation sd's and #' reference SNP count, computes the Gazal-style standardized tau (\eqn{\tau^*}) #' and the differential per-SNP heritability statistic (EnrichStat) for the -#' target annotations, including their per-block jackknife values for use in -#' cross-trait meta-analysis. +#' target annotations. #' -#' @details The standardization is +#' @details Standardization: #' \deqn{\tau^*_C = \tau_C \cdot sd_C \cdot M_{ref} / h^2_g} #' applied both to the point estimate and to each of the n_blocks jackknife -#' blocks. The EnrichStat is -#' \deqn{\frac{h^2_g}{M_{ref}}\left[\frac{\pi^{h^2}_C}{\pi^M_C} -#' - \frac{1-\pi^{h^2}_C}{1-\pi^M_C}\right]} -#' computed from the per-block tau values, with jackknife SE from the per-block -#' variance \eqn{\sqrt{\frac{(B-1)^2}{B} Var_b}}. +#' blocks (the per-block \eqn{\tau^*_C} values are returned for use as inputs +#' to cross-trait random-effects meta). +#' +#' EnrichStat point estimate: +#' \deqn{\text{EnrichStat}_C = \frac{h^2_g}{M_{ref}}\!\left[\frac{\pi^{h^2}_C}{\pi^M_C} - \frac{1-\pi^{h^2}_C}{1-\pi^M_C}\right]} +#' computed once from polyfun's reported \eqn{\pi^{h^2}_C} and \eqn{\pi^M_C}. +#' Its standard error is recovered from polyfun's `Enrichment_p` via +#' \deqn{|Z_C| = \Phi^{-1}(1 - p_C/2), \qquad SE_{\text{EnrichStat}_C} = |\text{EnrichStat}_C| / |Z_C|.} #' #' Only target annotations are returned; baseline annotations are dropped after -#' serving their role of conditioning the regression. +#' serving their role of conditioning the regression. Following the original +#' pipeline scope, EnrichStat / E are computed only when the call corresponds +#' to a single-target run; in joint-mode runs only \eqn{\tau^*_C} is produced +#' per target. #' #' @param trait_data List. Output of \code{\link{read_sldsc_trait}} for one polyfun run. #' @param sd_annot Named numeric vector. Output of #' \code{\link{compute_sldsc_annot_sd}}, restricted to target annotations. #' @param M_ref Scalar. Output of \code{\link{compute_sldsc_M_ref}}. #' @param target_categories Character vector or NULL. Annotation names to keep. -#' If NULL, all categories with names matching `names(sd_annot)` are kept. +#' If NULL, all categories matching `names(sd_annot)` are kept. +#' @param mode Character, one of `"single"` or `"joint"`. Controls which +#' quantities are produced (E and EnrichStat are produced only for `"single"`; +#' \eqn{\tau^*_C} is produced for both). #' #' @return A list with components: #' \describe{ -#' \item{summary}{Data frame with one row per target annotation and columns: -#' `target`, `tau`, `tau_se`, `tau_star`, `tau_star_se`, `enrichment`, -#' `enrichment_se`, `enrichment_p`, `enrichstat`, `enrichstat_se`.} +#' \item{summary}{Data frame with one row per target annotation. Columns: +#' `target`, `tau`, `tau_se`, `tau_star`, `tau_star_se`. For `mode="single"` +#' additionally `enrichment`, `enrichment_se`, `enrichment_p`, +#' `enrichstat`, `enrichstat_se` (the back-solved SE).} #' \item{tau_star_blocks}{Matrix (n_blocks x n_target) of per-block \eqn{\tau^*_C}.} -#' \item{enrichstat_blocks}{Matrix (n_blocks x n_target) of per-block EnrichStat.} #' \item{h2g}{Scalar trait heritability.} #' \item{n_blocks}{Integer.} +#' \item{mode}{Echo of the input mode.} #' } #' #' @export standardize_sldsc_trait <- function(trait_data, sd_annot, M_ref, - target_categories = NULL) { + target_categories = NULL, + mode = c("single", "joint")) { stop("Not yet implemented (skeleton).") } @@ -194,22 +202,30 @@ standardize_sldsc_trait <- function(trait_data, sd_annot, M_ref, #' \quad SE_{meta} = 1/\sqrt{\sum_i w_i}, #' \quad w_i = 1/(SE_i^2 + \hat\sigma^2)} #' via `rmeta::meta.summaries(..., method = "random")`. The two-sided p-value -#' is computed from the meta z-score under the standard normal. -#' -#' `quantity = "tau_star"` and `"enrichstat"` use the jackknife SE from -#' per-block delete values (computed inside -#' \code{\link{standardize_sldsc_trait}}). `quantity = "enrichment"` uses -#' the SE reported directly by the regression engine. +#' is computed from the meta z-score under the standard normal: +#' \eqn{p = 2\Phi(-|Z_{meta}|)}. +#' +#' Per-trait \eqn{SE_i} sources: +#' - `quantity = "tau_star"`: jackknife SE from the per-block \eqn{\tau^*} +#' values (computed in \code{\link{standardize_sldsc_trait}}). +#' - `quantity = "enrichment"`: polyfun-reported `Enrichment_std_error`. +#' - `quantity = "enrichstat"`: back-solved SE from polyfun's `Enrichment_p`, +#' \eqn{SE = |\text{EnrichStat}|/|\Phi^{-1}(1-p/2)|}. +#' +#' For binary-annotation enrichment reporting, callers typically combine two +#' meta runs: effect size and SE from `quantity="enrichment"` (interpretable +#' on the original enrichment-fold scale), p-value from `quantity="enrichstat"` +#' (the appropriate hypothesis test). The top-level +#' \code{\link{sldsc_postprocessing_pipeline}} performs this combination. #' #' @param per_trait_estimates Named list of standardized per-trait results from #' \code{\link{standardize_sldsc_trait}}. Pass a subset to re-meta on a #' user-chosen group of traits. #' @param category Character. Annotation name to meta-analyze. -#' @param quantity Character, one of `"tau_star"`, `"enrichment"`, or -#' `"enrichstat"`. +#' @param quantity Character, one of `"tau_star"`, `"enrichment"`, or `"enrichstat"`. #' -#' @return A list with components: `mean`, `se`, `p`, `n_traits`, -#' `traits_used`, and `tau2` (the DerSimonian-Laird between-trait variance). +#' @return A list with components: `mean`, `se`, `p`, `n_traits`, `traits_used`, +#' and `tau2` (the DerSimonian-Laird between-trait variance). #' #' @examples #' \dontrun{ @@ -229,56 +245,57 @@ meta_sldsc_random <- function(per_trait_estimates, category, #' @title End-to-end S-LDSC post-processing across traits, single + joint in one pass #' -#' @description Top-level convenience wrapper. Reads polyfun outputs (single-tau -#' runs and the joint-tau run) for each trait, standardizes both modes, and -#' runs the default random-effects meta-analysis across all traits supplied. -#' Single and joint analyses are produced in one pass; no `joint_tau` flag. +#' @description Top-level convenience wrapper. Reads polyfun outputs (one +#' single-target run per target plus one joint run per trait), standardizes +#' both modes, and runs the default random-effects meta-analysis across all +#' traits supplied. Single-target and joint-target analyses are produced in +#' one pass. #' -#' @details For a set of \eqn{N} target annotations and one trait, the caller is -#' expected to have already produced \eqn{N+1} polyfun runs: \eqn{N} single-tau -#' runs (each fitting one target + baseline) and 1 joint-tau run (fitting all -#' \eqn{N} targets + baseline together). The pipeline reads all of these, drops +#' @details For \eqn{N} target annotations and one trait, the caller is expected +#' to have already produced \eqn{N+1} polyfun runs: \eqn{N} single-target runs +#' (each fitting one target + baseline) and 1 joint run (fitting all \eqn{N} +#' targets + baseline together). The pipeline reads all of these, drops #' baseline categories, standardizes both modes, and assembles consolidated #' per-trait and meta data frames. #' #' Reports baseline annotation count and names via `message()` once at the -#' start of the run for transparency. +#' start of the run. +#' +#' Cross-type comparison: the `meta$tau_star` frame is the apple-to-apple +#' table for ranking annotations regardless of binary/continuous type. The +#' `is_binary` column lets callers filter when within-type comparison is desired. #' -#' Cross-type comparison: the `meta$tau_star` frame is the apple-to-apple table -#' for ranking annotations regardless of binary/continuous type. The `is_binary` -#' column lets callers filter further when within-type comparison is desired. +#' Two-channel enrichment meta (binary annotations only): the `meta$enrichment` +#' frame's `single_mean` and `single_se` come from the meta on \eqn{E_C}; +#' the `single_p` comes from the meta on EnrichStat (the appropriate +#' hypothesis test). #' #' @param trait_single_prefixes Named list. For each trait (name = trait id), -#' a character vector of length \eqn{N} giving the polyfun output prefixes for -#' the \eqn{N} single-tau runs (one per target annotation). Order must match -#' `target_categories`. -#' @param trait_joint_prefix Named character. For each trait (name = trait id, -#' matching `trait_single_prefixes`), the polyfun output prefix for the joint-tau -#' run that fits all \eqn{N} targets together. -#' @param target_anno_dir Character. Directory of the target annotation files -#' (used for `sd_C` and binary detection). When all \eqn{N} targets share -#' one directory (multi-column annot files), pass that. When each target has -#' its own directory, pass a named character vector (names = target categories). -#' @param frqfile_dir Character or NULL. Directory of `.frq` files for the panel. -#' @param plink_name Character. Filename prefix of `.frq` files (default `"ADSP_chr"`). +#' a character vector of length \eqn{N} giving the polyfun output prefixes +#' for the \eqn{N} single-target runs. Order must match `target_categories`. +#' @param trait_joint_prefix Named character. For each trait (name matching +#' `trait_single_prefixes`), the polyfun output prefix for the joint run. +#' @param target_anno_dir Character. Directory of target annotation files used +#' for `sd_C` and binary detection. Typically the joint-mode directory, since +#' it carries all target annotation columns. +#' @param frqfile_dir Character or NULL. +#' @param plink_name Character. Default `"ADSP_chr"`. #' @param maf_cutoff Numeric, default `0.05`. -#' @param target_categories Character vector or NULL. Target annotation names to -#' report. If NULL, auto-detected from the joint-run results as all categories -#' not in baseline. +#' @param target_categories Character vector or NULL. Auto-detected from the +#' joint-run results as all categories not in baseline if NULL. #' #' @return A list with components: #' \describe{ #' \item{per_trait}{Named list of per-trait results. For each trait, a list with: #' `summary` (wide data frame of single + joint estimates side-by-side per -#' target annotation, with `is_binary` flag), `tau_star_blocks_single`, -#' `tau_star_blocks_joint`, `enrichstat_blocks_single`, -#' `enrichstat_blocks_joint` (all matrices), and `h2g`.} +#' target annotation, with `is_binary` flag), `tau_star_blocks_single` +#' (matrix), `tau_star_blocks_joint` (matrix), and `h2g`.} #' \item{meta}{List of three data frames, each with one row per target annotation: #' \itemize{ #' \item `tau_star`: columns `target`, `is_binary`, `single_mean`, #' `single_se`, `single_p`, `joint_mean`, `joint_se`, `joint_p`, `n_traits`. -#' \item `enrichment`: same shape. -#' \item `enrichstat`: same shape. +#' \item `enrichment`: same shape; effect/SE from E meta, p from EnrichStat meta. +#' \item `enrichstat`: same shape; pure EnrichStat meta. #' } #' The `tau_star` frame is the cross-type comparable headline.} #' \item{params}{List echoing `maf_cutoff`, `M_ref`, `target_categories`, @@ -303,7 +320,7 @@ meta_sldsc_random <- function(per_trait_estimates, category, #' target_categories = c("anno1", "anno2") #' ) #' res$meta$tau_star # cross-type comparable headline -#' res$meta$enrichment # within-binary headline +#' res$meta$enrichment # within-binary headline (effect/SE from E, p from EnrichStat) #' #' # later, re-meta on a custom subset of traits #' meta_neuro <- meta_sldsc_random( From eaff5433d1e6bbc2c68f415024b7372772ef2933 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 7 May 2026 08:41:05 -0400 Subject: [PATCH 3/4] Implement core functions --- R/sldsc_wrapper.R | 812 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 601 insertions(+), 211 deletions(-) diff --git a/R/sldsc_wrapper.R b/R/sldsc_wrapper.R index 62d96943..b8e17c80 100644 --- a/R/sldsc_wrapper.R +++ b/R/sldsc_wrapper.R @@ -17,6 +17,22 @@ # that mix binary and continuous types. E (proportion-based enrichment) is # scale-dependent for continuous annotations and is only comparable within type. +# ---- internal helpers ---- + +.sldsc_std_cols <- c("CHR", "SNP", "BP", "CM", "A1", "A2", "MAF") + +.sldsc_chrom_from_filename <- function(f) { + bn <- basename(f) + m <- regmatches(bn, regexec("\\.([0-9]+)\\.annot\\.gz$", bn))[[1]] + if (length(m) >= 2) as.integer(m[2]) else NA_integer_ +} + +.sldsc_detect_annot_cols <- function(file_path) { + sample <- data.table::fread(file_path, nrows = 5L) + setdiff(names(sample), .sldsc_std_cols) +} + + #' @title Read S-LDSC outputs from polyfun for one trait/run #' #' @description Reads the regression outputs produced by `polyfun/ldsc.py` for a @@ -28,23 +44,7 @@ #' The function appends `.results`, `.log`, and `.part_delete` to this prefix. #' Example: `"/path/to/cwd/CAD_META.filtered.sumstats.gz"`. #' -#' @return A named list with components: -#' \describe{ -#' \item{categories}{Character vector of annotation names in regression order -#' (target annotations first, baseline last by polyfun convention).} -#' \item{tau}{Numeric named vector of regression coefficients \eqn{\tau_C}.} -#' \item{tau_se}{Numeric named vector of standard errors of \eqn{\tau_C}.} -#' \item{enrichment}{Numeric named vector of \eqn{E_C = \pi^{h^2}_C / \pi^M_C}.} -#' \item{enrichment_se}{Numeric named vector of standard errors of \eqn{E_C}.} -#' \item{enrichment_p}{Numeric named vector of p-values for the differential -#' per-SNP heritability test (the EnrichStat p-value reported by polyfun).} -#' \item{prop_h2}{Numeric named vector \eqn{\pi^{h^2}_C}.} -#' \item{prop_snps}{Numeric named vector \eqn{\pi^M_C}.} -#' \item{h2g}{Scalar total trait heritability \eqn{h^2_g}, parsed from the log.} -#' \item{tau_blocks}{Numeric matrix of per-block jackknife \eqn{\tau} values, -#' dimensions (n_blocks x n_annotations), columns named by category.} -#' \item{n_blocks}{Integer number of jackknife blocks (typically 200).} -#' } +#' @return A named list. See `sldsc_postprocessing_pipeline` for components. #' #' @examples #' \dontrun{ @@ -52,24 +52,58 @@ #' run$tau["my_target_annotation"] #' } #' +#' @importFrom data.table fread +#' @importFrom stats setNames var #' @export read_sldsc_trait <- function(prefix) { - stop("Not yet implemented (skeleton).") + results_file <- paste0(prefix, ".results") + log_file <- paste0(prefix, ".log") + delete_file <- paste0(prefix, ".part_delete") + + for (f in c(results_file, log_file, delete_file)) { + if (!file.exists(f)) stop("read_sldsc_trait: missing file: ", f) + } + + results <- data.table::fread(results_file) + cats <- as.character(results$Category) + + log_lines <- readLines(log_file, warn = FALSE) + h2_line <- grep("Total Observed scale h2:", log_lines, value = TRUE) + if (length(h2_line) == 0L) + stop("read_sldsc_trait: could not find 'Total Observed scale h2:' in ", log_file) + h2g <- suppressWarnings(as.numeric(gsub(".*h2: (-?[0-9.eE+-]+).*", "\\1", h2_line[1]))) + if (is.na(h2g)) + stop("read_sldsc_trait: failed to parse h2g numeric from log line: ", h2_line[1]) + + delete_values <- as.matrix(data.table::fread(delete_file)) + if (ncol(delete_values) != length(cats)) { + stop("read_sldsc_trait: .part_delete has ", ncol(delete_values), + " columns but .results has ", length(cats), " categories.") + } + colnames(delete_values) <- cats + + list( + categories = cats, + tau = stats::setNames(as.numeric(results$Coefficient), cats), + tau_se = stats::setNames(as.numeric(results[["Coefficient_std_error"]]), cats), + enrichment = stats::setNames(as.numeric(results$Enrichment), cats), + enrichment_se = stats::setNames(as.numeric(results[["Enrichment_std_error"]]), cats), + enrichment_p = stats::setNames(as.numeric(results[["Enrichment_p"]]), cats), + prop_h2 = stats::setNames(as.numeric(results[["Prop._h2"]]), cats), + prop_snps = stats::setNames(as.numeric(results[["Prop._SNPs"]]), cats), + h2g = h2g, + tau_blocks = delete_values, + n_blocks = nrow(delete_values) + ) } #' @title Compute per-annotation standard deviation, MAF-restricted #' #' @description Computes the standard deviation of each annotation column in the -#' target annotation files, restricted to SNPs above a MAF cutoff. The MAF -#' restriction is required for the standardization to be internally consistent -#' with polyfun's regression, which operates on MAF > cutoff SNPs by default. -#' -#' @details Implementation walks per-chromosome `.annot.gz` files, joins them -#' with PLINK `.frq` files when `maf_cutoff > 0`, and pools across chromosomes -#' with the standard pooled-variance formula -#' \deqn{sd_C = \sqrt{\sum_{\text{chr}} (n_{\text{chr}} - 1)\,\mathrm{Var}(C_{\text{chr}}) \,/\, \sum_{\text{chr}} (n_{\text{chr}} - 1)}} -#' over the MAF-restricted SNPs only. +#' target annotation files, restricted to SNPs above a MAF cutoff via PLINK +#' `.frq` files. Required for internal consistency with polyfun's regression, +#' which operates on MAF > cutoff SNPs by default. #' #' @param target_anno_dir Character. Directory containing target annotation files #' (one per chromosome) in polyfun's `.annot.gz` format. @@ -78,50 +112,139 @@ read_sldsc_trait <- function(prefix) { #' errors if missing. #' @param plink_name Character. Filename prefix of the `.frq` files #' (e.g. `"ADSP_chr"`). Files are expected at `{plink_name}{chr}.frq`. -#' @param maf_cutoff Numeric, default `0.05`. Only SNPs with `MAF > maf_cutoff` -#' contribute to sd. Set to `0` to disable (must match the regression's -#' `--not-M-5-50` mode). +#' @param maf_cutoff Numeric, default `0.05`. #' @param annot_cols Character or integer vector, default NULL. Annotation columns -#' to compute sd for. If NULL, all annotation columns (i.e. all columns past -#' the standard fixed columns CHR/SNP/BP/CM/A1/A2/MAF) are used. +#' to compute sd for. If NULL, all annotation columns are used. #' #' @return Named numeric vector of \eqn{sd_C} values, one per annotation. #' +#' @importFrom data.table fread +#' @importFrom stats setNames var #' @export compute_sldsc_annot_sd <- function(target_anno_dir, frqfile_dir = NULL, plink_name = "ADSP_chr", maf_cutoff = 0.05, annot_cols = NULL) { - stop("Not yet implemented (skeleton).") + if (maf_cutoff > 0 && (is.null(frqfile_dir) || !dir.exists(frqfile_dir))) { + stop("compute_sldsc_annot_sd: maf_cutoff = ", maf_cutoff, + " requires frqfile_dir, but '", frqfile_dir, "' is not a directory.") + } + if (!dir.exists(target_anno_dir)) { + stop("compute_sldsc_annot_sd: target_anno_dir does not exist: ", target_anno_dir) + } + + anno_files <- list.files(target_anno_dir, pattern = "\\.annot\\.gz$", full.names = TRUE) + if (length(anno_files) == 0L) + stop("compute_sldsc_annot_sd: no .annot.gz files in: ", target_anno_dir) + + detected <- .sldsc_detect_annot_cols(anno_files[1]) + if (is.null(annot_cols)) { + cols_use <- detected + } else if (is.numeric(annot_cols)) { + cols_use <- detected[annot_cols] + } else { + cols_use <- annot_cols + } + if (length(cols_use) == 0L) + stop("compute_sldsc_annot_sd: no annotation columns to process.") + + num <- stats::setNames(numeric(length(cols_use)), cols_use) + den <- 0 + + for (anno_file in anno_files) { + dat <- data.table::fread(anno_file) + if (maf_cutoff > 0) { + chrom <- .sldsc_chrom_from_filename(anno_file) + if (is.na(chrom)) + stop("compute_sldsc_annot_sd: could not parse chromosome from: ", anno_file) + frq_file <- file.path(frqfile_dir, paste0(plink_name, chrom, ".frq")) + if (!file.exists(frq_file)) + stop("compute_sldsc_annot_sd: .frq file not found: ", frq_file) + frq <- data.table::fread(frq_file, select = c("SNP", "MAF")) + dat <- merge(dat, frq, by = "SNP", all.x = FALSE, all.y = FALSE) + dat <- dat[!is.na(dat$MAF) & dat$MAF > maf_cutoff, ] + } + if (nrow(dat) <= 1L) next + n_minus_1 <- nrow(dat) - 1L + for (col in cols_use) { + vals <- as.numeric(dat[[col]]) + v <- stats::var(vals, na.rm = TRUE) + if (!is.na(v)) num[col] <- num[col] + n_minus_1 * v + } + den <- den + n_minus_1 + } + + if (den <= 0) + stop("compute_sldsc_annot_sd: zero degrees of freedom after MAF filtering.") + sqrt(num / den) } #' @title Reference-panel SNP count at a given MAF cutoff #' #' @description Returns the number of SNPs in the reference panel above the MAF -#' cutoff, used as \eqn{M_{ref}} in the standardization. When -#' `maf_cutoff > 0` the function sums `.l2.M_5_50` files (matching polyfun's -#' regression which uses M_5_50 with `--frqfile-chr`); when `maf_cutoff == 0` -#' it sums `.l2.M` files (matching the `--not-M-5-50` regression mode). -#' -#' @param target_anno_dir Character. Directory containing per-chromosome -#' `.l2.M_5_50` (when MAF-filtered) or `.l2.M` (when not) files produced by -#' polyfun's `compute_ldscores.py`. +#' cutoff. When `maf_cutoff > 0`, counts MAF > cutoff entries across all +#' `.frq` files. When `maf_cutoff == 0`, counts rows of `.l2.ldscore` +#' files in `target_anno_dir`. +#' +#' @param target_anno_dir Character or NULL. Directory of `.l2.ldscore` files +#' produced by polyfun's `compute_ldscores.py`. Required when +#' `maf_cutoff == 0`. +#' @param frqfile_dir Character or NULL. Directory of PLINK `.frq` files. +#' Required when `maf_cutoff > 0`. +#' @param plink_name Character. Filename prefix of `.frq` files. #' @param maf_cutoff Numeric, default `0.05`. #' -#' @return Scalar integer: total number of SNPs in the reference panel above the cutoff. +#' @return Scalar integer. #' +#' @importFrom data.table fread #' @export -compute_sldsc_M_ref <- function(target_anno_dir, maf_cutoff = 0.05) { - stop("Not yet implemented (skeleton).") +compute_sldsc_M_ref <- function(target_anno_dir = NULL, frqfile_dir = NULL, + plink_name = "ADSP_chr", maf_cutoff = 0.05) { + if (maf_cutoff > 0) { + if (is.null(frqfile_dir) || !dir.exists(frqfile_dir)) + stop("compute_sldsc_M_ref: maf_cutoff = ", maf_cutoff, + " requires frqfile_dir.") + pat <- paste0("^", gsub("([.])", "\\\\\\1", plink_name), "[0-9]+\\.frq$") + frq_files <- list.files(frqfile_dir, pattern = pat, full.names = TRUE) + if (length(frq_files) == 0L) + frq_files <- list.files(frqfile_dir, pattern = "\\.frq$", full.names = TRUE) + if (length(frq_files) == 0L) + stop("compute_sldsc_M_ref: no .frq files found in: ", frqfile_dir) + + total <- 0L + for (f in frq_files) { + frq <- data.table::fread(f, select = "MAF") + total <- total + sum(!is.na(frq$MAF) & frq$MAF > maf_cutoff) + } + return(as.integer(total)) + } + + if (is.null(target_anno_dir) || !dir.exists(target_anno_dir)) + stop("compute_sldsc_M_ref: maf_cutoff = 0 requires target_anno_dir.") + files <- list.files(target_anno_dir, + pattern = "\\.l2\\.ldscore\\.(gz|parquet)$", + full.names = TRUE) + if (length(files) == 0L) + stop("compute_sldsc_M_ref: no .l2.ldscore files in: ", target_anno_dir) + + total <- 0L + for (f in files) { + if (endsWith(f, ".parquet")) { + if (!requireNamespace("arrow", quietly = TRUE)) + stop("compute_sldsc_M_ref: install 'arrow' to read .parquet files.") + total <- total + nrow(arrow::read_parquet(f)) + } else { + total <- total + nrow(data.table::fread(f)) + } + } + as.integer(total) } #' @title Detect whether each annotation is binary or continuous #' -#' @description Inspects each annotation column in the target annotation files -#' and returns whether its values lie in \{0, 1\} (binary) or take other values -#' (continuous). Used to select the appropriate within-type headline statistic. -#' For cross-type comparison, always use \eqn{\tau^*_C} regardless of the flag. +#' @description Inspects each annotation column and returns whether its values +#' lie in \{0, 1\} (binary) or take other values (continuous). #' #' @param target_anno_dir Character. Directory containing the target `.annot.gz` #' files (one per chromosome). @@ -129,207 +252,269 @@ compute_sldsc_M_ref <- function(target_anno_dir, maf_cutoff = 0.05) { #' #' @return Named logical vector: TRUE for binary, FALSE for continuous. #' +#' @importFrom data.table fread +#' @importFrom stats setNames #' @export is_binary_sldsc_annot <- function(target_anno_dir, annot_cols = NULL) { - stop("Not yet implemented (skeleton).") + anno_files <- list.files(target_anno_dir, pattern = "\\.annot\\.gz$", full.names = TRUE) + if (length(anno_files) == 0L) + stop("is_binary_sldsc_annot: no .annot.gz files in: ", target_anno_dir) + + detected <- .sldsc_detect_annot_cols(anno_files[1]) + if (is.null(annot_cols)) { + cols_use <- detected + } else if (is.numeric(annot_cols)) { + cols_use <- detected[annot_cols] + } else { + cols_use <- annot_cols + } + + is_binary <- stats::setNames(rep(TRUE, length(cols_use)), cols_use) + + for (f in anno_files) { + dat <- data.table::fread(f, select = cols_use) + for (col in cols_use) { + if (!is_binary[[col]]) next + vals <- unique(stats::na.omit(as.numeric(dat[[col]]))) + if (any(!(vals %in% c(0, 1)))) is_binary[[col]] <- FALSE + } + if (!any(is_binary)) break + } + + is_binary } #' @title Standardize tau and compute EnrichStat for one polyfun run #' -#' @description Given the polyfun read result plus the annotation sd's and -#' reference SNP count, computes the Gazal-style standardized tau (\eqn{\tau^*}) -#' and the differential per-SNP heritability statistic (EnrichStat) for the -#' target annotations. -#' -#' @details Standardization: -#' \deqn{\tau^*_C = \tau_C \cdot sd_C \cdot M_{ref} / h^2_g} -#' applied both to the point estimate and to each of the n_blocks jackknife -#' blocks (the per-block \eqn{\tau^*_C} values are returned for use as inputs -#' to cross-trait random-effects meta). +#' @description Applies the Gazal standardization +#' \eqn{\tau^*_C = \tau_C \cdot sd_C \cdot M_{ref} / h^2_g} to the point and +#' to each jackknife block. For `mode = "single"`, additionally computes +#' EnrichStat and back-solves its standard error from polyfun's reported +#' `Enrichment_p` using \eqn{|Z| = \Phi^{-1}(1 - p/2)}. #' -#' EnrichStat point estimate: -#' \deqn{\text{EnrichStat}_C = \frac{h^2_g}{M_{ref}}\!\left[\frac{\pi^{h^2}_C}{\pi^M_C} - \frac{1-\pi^{h^2}_C}{1-\pi^M_C}\right]} -#' computed once from polyfun's reported \eqn{\pi^{h^2}_C} and \eqn{\pi^M_C}. -#' Its standard error is recovered from polyfun's `Enrichment_p` via -#' \deqn{|Z_C| = \Phi^{-1}(1 - p_C/2), \qquad SE_{\text{EnrichStat}_C} = |\text{EnrichStat}_C| / |Z_C|.} +#' @param trait_data List from \code{\link{read_sldsc_trait}}. +#' @param sd_annot Named numeric vector from \code{\link{compute_sldsc_annot_sd}}. +#' @param M_ref Scalar from \code{\link{compute_sldsc_M_ref}}. +#' @param target_categories Character vector or NULL. If NULL, intersects +#' `trait_data$categories` with `names(sd_annot)`. +#' @param mode Character: `"single"` or `"joint"`. #' -#' Only target annotations are returned; baseline annotations are dropped after -#' serving their role of conditioning the regression. Following the original -#' pipeline scope, EnrichStat / E are computed only when the call corresponds -#' to a single-target run; in joint-mode runs only \eqn{\tau^*_C} is produced -#' per target. -#' -#' @param trait_data List. Output of \code{\link{read_sldsc_trait}} for one polyfun run. -#' @param sd_annot Named numeric vector. Output of -#' \code{\link{compute_sldsc_annot_sd}}, restricted to target annotations. -#' @param M_ref Scalar. Output of \code{\link{compute_sldsc_M_ref}}. -#' @param target_categories Character vector or NULL. Annotation names to keep. -#' If NULL, all categories matching `names(sd_annot)` are kept. -#' @param mode Character, one of `"single"` or `"joint"`. Controls which -#' quantities are produced (E and EnrichStat are produced only for `"single"`; -#' \eqn{\tau^*_C} is produced for both). -#' -#' @return A list with components: -#' \describe{ -#' \item{summary}{Data frame with one row per target annotation. Columns: -#' `target`, `tau`, `tau_se`, `tau_star`, `tau_star_se`. For `mode="single"` -#' additionally `enrichment`, `enrichment_se`, `enrichment_p`, -#' `enrichstat`, `enrichstat_se` (the back-solved SE).} -#' \item{tau_star_blocks}{Matrix (n_blocks x n_target) of per-block \eqn{\tau^*_C}.} -#' \item{h2g}{Scalar trait heritability.} -#' \item{n_blocks}{Integer.} -#' \item{mode}{Echo of the input mode.} -#' } +#' @return A list with `summary` (data frame), `tau_star_blocks` (matrix), +#' `h2g`, `n_blocks`, `mode`. #' +#' @importFrom stats qnorm var #' @export standardize_sldsc_trait <- function(trait_data, sd_annot, M_ref, target_categories = NULL, mode = c("single", "joint")) { - stop("Not yet implemented (skeleton).") + mode <- match.arg(mode) + if (is.null(target_categories)) + target_categories <- intersect(trait_data$categories, names(sd_annot)) + if (length(target_categories) == 0L) + stop("standardize_sldsc_trait: no target categories.") + + target_idx <- match(target_categories, trait_data$categories) + if (any(is.na(target_idx))) + stop("standardize_sldsc_trait: missing categories: ", + paste(target_categories[is.na(target_idx)], collapse = ", ")) + + h2g <- trait_data$h2g + sd_target <- as.numeric(sd_annot[target_categories]) + if (any(is.na(sd_target) | sd_target == 0)) + warning("standardize_sldsc_trait: zero/NA sd for some targets; tau* will be NA/0.") + + coef <- sd_target * M_ref / h2g + tau <- as.numeric(trait_data$tau[target_categories]) + tau_se <- as.numeric(trait_data$tau_se[target_categories]) + tau_star <- tau * coef + + blocks_target <- trait_data$tau_blocks[, target_idx, drop = FALSE] + tau_star_blocks <- sweep(blocks_target, 2L, coef, FUN = "*") + + B <- trait_data$n_blocks + jk_var <- apply(tau_star_blocks, 2L, function(x) stats::var(x, na.rm = TRUE)) + tau_star_se <- sqrt((B - 1)^2 / B * jk_var) + + summary_df <- data.frame( + target = target_categories, + tau = tau, + tau_se = tau_se, + tau_star = tau_star, + tau_star_se = tau_star_se, + stringsAsFactors = FALSE + ) + + if (mode == "single") { + enrich <- as.numeric(trait_data$enrichment[target_categories]) + enrich_se <- as.numeric(trait_data$enrichment_se[target_categories]) + enrich_p <- as.numeric(trait_data$enrichment_p[target_categories]) + p_h2 <- as.numeric(trait_data$prop_h2[target_categories]) + p_M <- as.numeric(trait_data$prop_snps[target_categories]) + + diff_ratio <- (p_h2 / p_M) - (1 - p_h2) / (1 - p_M) + enrichstat <- (h2g / M_ref) * diff_ratio + + abs_z <- stats::qnorm(1 - enrich_p / 2) + enrichstat_se <- abs(enrichstat) / abs_z + enrichstat_se[!is.finite(abs_z) | abs_z <= 0] <- NA_real_ + + summary_df$enrichment <- enrich + summary_df$enrichment_se <- enrich_se + summary_df$enrichment_p <- enrich_p + summary_df$enrichstat <- enrichstat + summary_df$enrichstat_se <- enrichstat_se + } + + list( + summary = summary_df, + tau_star_blocks = tau_star_blocks, + h2g = h2g, + n_blocks = B, + mode = mode + ) } #' @title Random-effects meta-analysis of S-LDSC quantities across traits #' #' @description DerSimonian-Laird random-effects meta-analysis of one S-LDSC -#' quantity across multiple traits, for one annotation. Used as the entry -#' point both for the default pipeline meta and for user-driven re-meta over -#' subsets of traits. +#' quantity for one annotation across multiple traits. #' -#' @details Implements -#' \deqn{\hat\theta_{meta} = \sum_i w_i \hat\theta_i / \sum_i w_i, -#' \quad SE_{meta} = 1/\sqrt{\sum_i w_i}, -#' \quad w_i = 1/(SE_i^2 + \hat\sigma^2)} -#' via `rmeta::meta.summaries(..., method = "random")`. The two-sided p-value -#' is computed from the meta z-score under the standard normal: -#' \eqn{p = 2\Phi(-|Z_{meta}|)}. -#' -#' Per-trait \eqn{SE_i} sources: -#' - `quantity = "tau_star"`: jackknife SE from the per-block \eqn{\tau^*} -#' values (computed in \code{\link{standardize_sldsc_trait}}). +#' @details Per-trait \eqn{SE_i} sources: +#' - `quantity = "tau_star"`: jackknife SE from per-block \eqn{\tau^*}. #' - `quantity = "enrichment"`: polyfun-reported `Enrichment_std_error`. -#' - `quantity = "enrichstat"`: back-solved SE from polyfun's `Enrichment_p`, -#' \eqn{SE = |\text{EnrichStat}|/|\Phi^{-1}(1-p/2)|}. -#' -#' For binary-annotation enrichment reporting, callers typically combine two -#' meta runs: effect size and SE from `quantity="enrichment"` (interpretable -#' on the original enrichment-fold scale), p-value from `quantity="enrichstat"` -#' (the appropriate hypothesis test). The top-level -#' \code{\link{sldsc_postprocessing_pipeline}} performs this combination. +#' - `quantity = "enrichstat"`: back-solved SE from polyfun's `Enrichment_p`. #' -#' @param per_trait_estimates Named list of standardized per-trait results from -#' \code{\link{standardize_sldsc_trait}}. Pass a subset to re-meta on a -#' user-chosen group of traits. +#' @param per_trait_estimates Named list of per-trait results (each with a +#' `summary` data frame). #' @param category Character. Annotation name to meta-analyze. -#' @param quantity Character, one of `"tau_star"`, `"enrichment"`, or `"enrichstat"`. +#' @param quantity Character: `"tau_star"`, `"enrichment"`, or `"enrichstat"`. #' -#' @return A list with components: `mean`, `se`, `p`, `n_traits`, `traits_used`, -#' and `tau2` (the DerSimonian-Laird between-trait variance). -#' -#' @examples -#' \dontrun{ -#' meta_all <- meta_sldsc_random(per_trait_results, "my_anno", "tau_star") -#' -#' # subset to a custom trait group -#' subset <- per_trait_results[c("CAD_META", "AD_GWAX", "PD_meta")] -#' meta_neuro <- meta_sldsc_random(subset, "my_anno", "tau_star") -#' } +#' @return List with `mean`, `se`, `p`, `n_traits`, `traits_used`, `tau2`. #' +#' @importFrom stats pnorm #' @export meta_sldsc_random <- function(per_trait_estimates, category, quantity = c("tau_star", "enrichment", "enrichstat")) { - stop("Not yet implemented (skeleton).") + quantity <- match.arg(quantity) + col_pairs <- list( + tau_star = c("tau_star", "tau_star_se"), + enrichment = c("enrichment", "enrichment_se"), + enrichstat = c("enrichstat", "enrichstat_se") + ) + cols <- col_pairs[[quantity]] + trait_names <- names(per_trait_estimates) + if (is.null(trait_names)) + trait_names <- as.character(seq_along(per_trait_estimates)) + + means <- numeric(0); ses <- numeric(0); used <- character(0) + for (i in seq_along(per_trait_estimates)) { + pt <- per_trait_estimates[[i]] + if (is.null(pt) || is.null(pt$summary)) next + df <- pt$summary + row <- df[df$target == category, , drop = FALSE] + if (nrow(row) == 0L) next + if (!all(cols %in% names(row))) next + m <- as.numeric(row[[cols[1]]])[1] + s <- as.numeric(row[[cols[2]]])[1] + if (is.na(m) || is.na(s) || !is.finite(s) || s <= 0) next + means <- c(means, m); ses <- c(ses, s); used <- c(used, trait_names[i]) + } + + if (length(means) < 2L) { + return(list(mean = NA_real_, se = NA_real_, p = NA_real_, + n_traits = length(means), traits_used = used, + tau2 = NA_real_)) + } + if (!requireNamespace("rmeta", quietly = TRUE)) + stop("meta_sldsc_random: install the 'rmeta' package.") + + meta <- rmeta::meta.summaries(means, ses, method = "random") + z <- meta$summary / meta$se.summary + p <- 2 * stats::pnorm(-abs(z)) + list( + mean = as.numeric(meta$summary), + se = as.numeric(meta$se.summary), + p = as.numeric(p), + n_traits = length(means), + traits_used = used, + tau2 = as.numeric(meta$tau2) + ) +} + + +# Internal helper: assemble a wide per-trait summary frame with single + joint +# columns side by side. +.sldsc_assemble_trait_summary <- function(single_df, joint_df, target_categories, + is_binary_vec) { + rows <- if (!is.null(single_df)) single_df$target else + if (!is.null(joint_df)) joint_df$target else target_categories + out <- data.frame(target = rows, + is_binary = unname(is_binary_vec[rows]), + stringsAsFactors = FALSE) + + add_cols <- function(out, src, suffix) { + cols_to_add <- c("tau", "tau_se", "tau_star", "tau_star_se", + "enrichment", "enrichment_se", "enrichment_p", + "enrichstat", "enrichstat_se") + for (c in cols_to_add) { + newcol <- paste0(c, "_", suffix) + if (!is.null(src) && c %in% names(src)) { + out[[newcol]] <- src[[c]][match(out$target, src$target)] + } else { + out[[newcol]] <- NA_real_ + } + } + out + } + out <- add_cols(out, single_df, "single") + out <- add_cols(out, joint_df, "joint") + out +} + + +# Internal helper: build a per-trait list view that meta_sldsc_random can read. +# Each list element has a $summary frame with the requested mode's columns +# renamed to the canonical names (tau_star, tau_star_se, enrichment, ...). +.sldsc_view_for_meta <- function(per_trait, suffix) { + lapply(per_trait, function(pt) { + if (is.null(pt$summary)) return(NULL) + df <- pt$summary + cols_have <- c("tau_star", "tau_star_se", "enrichment", "enrichment_se", + "enrichment_p", "enrichstat", "enrichstat_se") + src_cols <- paste0(cols_have, "_", suffix) + avail <- src_cols %in% names(df) + if (!any(avail)) return(NULL) + new_df <- data.frame(target = df$target, stringsAsFactors = FALSE) + for (k in seq_along(cols_have)) { + if (avail[k]) new_df[[cols_have[k]]] <- df[[src_cols[k]]] + } + list(summary = new_df) + }) } #' @title End-to-end S-LDSC post-processing across traits, single + joint in one pass #' -#' @description Top-level convenience wrapper. Reads polyfun outputs (one -#' single-target run per target plus one joint run per trait), standardizes -#' both modes, and runs the default random-effects meta-analysis across all -#' traits supplied. Single-target and joint-target analyses are produced in -#' one pass. -#' -#' @details For \eqn{N} target annotations and one trait, the caller is expected -#' to have already produced \eqn{N+1} polyfun runs: \eqn{N} single-target runs -#' (each fitting one target + baseline) and 1 joint run (fitting all \eqn{N} -#' targets + baseline together). The pipeline reads all of these, drops -#' baseline categories, standardizes both modes, and assembles consolidated -#' per-trait and meta data frames. -#' -#' Reports baseline annotation count and names via `message()` once at the -#' start of the run. -#' -#' Cross-type comparison: the `meta$tau_star` frame is the apple-to-apple -#' table for ranking annotations regardless of binary/continuous type. The -#' `is_binary` column lets callers filter when within-type comparison is desired. -#' -#' Two-channel enrichment meta (binary annotations only): the `meta$enrichment` -#' frame's `single_mean` and `single_se` come from the meta on \eqn{E_C}; -#' the `single_p` comes from the meta on EnrichStat (the appropriate -#' hypothesis test). -#' -#' @param trait_single_prefixes Named list. For each trait (name = trait id), -#' a character vector of length \eqn{N} giving the polyfun output prefixes -#' for the \eqn{N} single-target runs. Order must match `target_categories`. -#' @param trait_joint_prefix Named character. For each trait (name matching -#' `trait_single_prefixes`), the polyfun output prefix for the joint run. -#' @param target_anno_dir Character. Directory of target annotation files used -#' for `sd_C` and binary detection. Typically the joint-mode directory, since -#' it carries all target annotation columns. +#' @description Top-level orchestration. Reads polyfun outputs (one single-target +#' run per target plus, when available, one joint run per trait), standardizes +#' both modes, and runs the default random-effects meta across all traits. +#' +#' @param trait_single_prefixes Named list. For each trait, a character vector +#' of length \eqn{N} giving the polyfun output prefixes for the \eqn{N} +#' single-target runs (order must match `target_categories`). +#' @param trait_joint_prefix Named character. For each trait, the polyfun output +#' prefix for the joint run. Pass `NA` (or `""`) for a trait without a joint run. +#' @param target_anno_dir Character. Directory of target `.annot.gz` files used +#' for `sd_C` and binary detection (typically the joint-mode dir). #' @param frqfile_dir Character or NULL. #' @param plink_name Character. Default `"ADSP_chr"`. #' @param maf_cutoff Numeric, default `0.05`. #' @param target_categories Character vector or NULL. Auto-detected from the -#' joint-run results as all categories not in baseline if NULL. +#' first available run if NULL. #' -#' @return A list with components: -#' \describe{ -#' \item{per_trait}{Named list of per-trait results. For each trait, a list with: -#' `summary` (wide data frame of single + joint estimates side-by-side per -#' target annotation, with `is_binary` flag), `tau_star_blocks_single` -#' (matrix), `tau_star_blocks_joint` (matrix), and `h2g`.} -#' \item{meta}{List of three data frames, each with one row per target annotation: -#' \itemize{ -#' \item `tau_star`: columns `target`, `is_binary`, `single_mean`, -#' `single_se`, `single_p`, `joint_mean`, `joint_se`, `joint_p`, `n_traits`. -#' \item `enrichment`: same shape; effect/SE from E meta, p from EnrichStat meta. -#' \item `enrichstat`: same shape; pure EnrichStat meta. -#' } -#' The `tau_star` frame is the cross-type comparable headline.} -#' \item{params}{List echoing `maf_cutoff`, `M_ref`, `target_categories`, -#' `n_baseline`, `baseline_categories`, and `trait_names` for reproducibility.} -#' } -#' -#' @examples -#' \dontrun{ -#' res <- sldsc_postprocessing_pipeline( -#' trait_single_prefixes = list( -#' CAD_META = c("/output/CAD_META.single_anno1", "/output/CAD_META.single_anno2"), -#' AD_GWAX = c("/output/AD_GWAX.single_anno1", "/output/AD_GWAX.single_anno2") -#' ), -#' trait_joint_prefix = c( -#' CAD_META = "/output/CAD_META.joint", -#' AD_GWAX = "/output/AD_GWAX.joint" -#' ), -#' target_anno_dir = "/output/ldscores/my_targets", -#' frqfile_dir = "/ref/ADSP/frq", -#' plink_name = "ADSP_chr", -#' maf_cutoff = 0.05, -#' target_categories = c("anno1", "anno2") -#' ) -#' res$meta$tau_star # cross-type comparable headline -#' res$meta$enrichment # within-binary headline (effect/SE from E, p from EnrichStat) -#' -#' # later, re-meta on a custom subset of traits -#' meta_neuro <- meta_sldsc_random( -#' res$per_trait[c("AD_GWAX", "PD_meta")], "anno1", "tau_star" -#' ) -#' } -#' -#' @seealso \code{\link{read_sldsc_trait}}, \code{\link{standardize_sldsc_trait}}, -#' \code{\link{meta_sldsc_random}} +#' @return List with `per_trait`, `meta` (three frames), `params`. #' #' @export sldsc_postprocessing_pipeline <- function(trait_single_prefixes, @@ -339,5 +524,210 @@ sldsc_postprocessing_pipeline <- function(trait_single_prefixes, plink_name = "ADSP_chr", maf_cutoff = 0.05, target_categories = NULL) { - stop("Not yet implemented (skeleton).") + trait_names <- names(trait_single_prefixes) + if (is.null(trait_names)) + stop("sldsc_postprocessing_pipeline: trait_single_prefixes must be a named list.") + + message("[sldsc] Computing M_ref...") + M_ref <- compute_sldsc_M_ref(target_anno_dir = target_anno_dir, + frqfile_dir = frqfile_dir, + plink_name = plink_name, + maf_cutoff = maf_cutoff) + message(sprintf("[sldsc] M_ref = %d (MAF cutoff %g)", M_ref, maf_cutoff)) + + message("[sldsc] Computing per-annotation sd...") + sd_annot_full <- compute_sldsc_annot_sd(target_anno_dir = target_anno_dir, + frqfile_dir = frqfile_dir, + plink_name = plink_name, + maf_cutoff = maf_cutoff) + message(sprintf("[sldsc] sd computed for %d annotation columns", + length(sd_annot_full))) + + message("[sldsc] Detecting binary vs continuous annotations...") + is_binary_full <- is_binary_sldsc_annot(target_anno_dir = target_anno_dir) + + # Auto-detect target categories from a representative run. + if (is.null(target_categories)) { + pivot_run <- NULL + if (!is.null(trait_joint_prefix) && length(trait_joint_prefix) > 0) { + jp <- trait_joint_prefix[[1]] + if (is.character(jp) && length(jp) == 1L && !is.na(jp) && nzchar(jp)) { + pivot_run <- tryCatch(read_sldsc_trait(jp), error = function(e) NULL) + } + } + if (is.null(pivot_run) && + length(trait_single_prefixes) > 0L && + length(trait_single_prefixes[[1]]) > 0L) { + pivot_run <- tryCatch(read_sldsc_trait(trait_single_prefixes[[1]][1]), + error = function(e) NULL) + } + if (is.null(pivot_run)) + stop("sldsc_postprocessing_pipeline: cannot auto-detect target_categories.") + target_categories <- intersect(pivot_run$categories, names(sd_annot_full)) + message(sprintf("[sldsc] Auto-detected %d target categories", length(target_categories))) + } + + baseline_categories <- character(0) + if (!is.null(trait_joint_prefix) && length(trait_joint_prefix) > 0L) { + jp <- trait_joint_prefix[[1]] + if (is.character(jp) && length(jp) == 1L && !is.na(jp) && nzchar(jp)) { + pivot <- tryCatch(read_sldsc_trait(jp), error = function(e) NULL) + if (!is.null(pivot)) + baseline_categories <- setdiff(pivot$categories, target_categories) + } + } + if (length(baseline_categories) > 0L) { + msg_head <- paste(utils::head(baseline_categories, 5), collapse = ", ") + msg_tail <- if (length(baseline_categories) > 5) ", ..." else "" + message(sprintf("[sldsc] Detected %d baseline annotations: %s%s", + length(baseline_categories), msg_head, msg_tail)) + } else { + message("[sldsc] No baseline annotations detected (joint-run prefix missing or unreadable).") + } + + sd_annot <- sd_annot_full[target_categories] + is_binary <- if (length(is_binary_full) > 0L) is_binary_full[target_categories] else + stats::setNames(rep(FALSE, length(target_categories)), target_categories) + + message(sprintf("[sldsc] Standardizing %d traits...", length(trait_names))) + per_trait <- list() + + for (trait in trait_names) { + # ---- single-mode ---- + single_summaries <- list() + single_blocks <- list() + single_h2gs <- numeric(0) + sing_prefs <- trait_single_prefixes[[trait]] + for (i in seq_along(target_categories)) { + cat_name <- target_categories[i] + if (i > length(sing_prefs)) break + pref <- sing_prefs[i] + run <- tryCatch(read_sldsc_trait(pref), error = function(e) { + warning(sprintf("[sldsc] Failed to read single %s for %s: %s", + cat_name, trait, e$message)); NULL + }) + if (is.null(run)) next + std <- tryCatch( + standardize_sldsc_trait(run, sd_annot[cat_name], M_ref, + target_categories = cat_name, mode = "single"), + error = function(e) { + warning(sprintf("[sldsc] Failed to standardize single %s for %s: %s", + cat_name, trait, e$message)); NULL + }) + if (is.null(std)) next + single_summaries[[cat_name]] <- std$summary + single_blocks[[cat_name]] <- std$tau_star_blocks + single_h2gs <- c(single_h2gs, std$h2g) + } + single_df <- if (length(single_summaries) > 0L) + do.call(rbind, single_summaries) else NULL + if (!is.null(single_df)) rownames(single_df) <- NULL + blocks_single <- if (length(single_blocks) > 0L) do.call(cbind, single_blocks) else NULL + + # ---- joint-mode ---- + joint_df <- NULL + blocks_joint <- NULL + joint_h2g <- NA_real_ + n_blocks_trait <- NA_integer_ + if (!is.null(trait_joint_prefix) && trait %in% names(trait_joint_prefix)) { + jp <- trait_joint_prefix[[trait]] + if (is.character(jp) && length(jp) == 1L && !is.na(jp) && nzchar(jp)) { + run <- tryCatch(read_sldsc_trait(jp), error = function(e) { + warning(sprintf("[sldsc] Failed to read joint for %s: %s", + trait, e$message)); NULL + }) + if (!is.null(run)) { + std <- tryCatch( + standardize_sldsc_trait(run, sd_annot, M_ref, + target_categories = target_categories, + mode = "joint"), + error = function(e) { + warning(sprintf("[sldsc] Failed to standardize joint for %s: %s", + trait, e$message)); NULL + }) + if (!is.null(std)) { + joint_df <- std$summary + blocks_joint <- std$tau_star_blocks + joint_h2g <- std$h2g + n_blocks_trait <- std$n_blocks + } + } + } + } + + summary_wide <- .sldsc_assemble_trait_summary(single_df, joint_df, + target_categories, is_binary) + per_trait[[trait]] <- list( + summary = summary_wide, + tau_star_blocks_single = blocks_single, + tau_star_blocks_joint = blocks_joint, + h2g = if (!is.na(joint_h2g)) joint_h2g + else if (length(single_h2gs) > 0L) median(single_h2gs) + else NA_real_, + n_blocks = n_blocks_trait + ) + } + + message("[sldsc] Running random-effects meta across traits...") + pt_view_single <- .sldsc_view_for_meta(per_trait, "single") + pt_view_joint <- .sldsc_view_for_meta(per_trait, "joint") + + build_table <- function(quantity, view, label) { + rows <- list() + for (cat in target_categories) { + m <- meta_sldsc_random(view, cat, quantity) + rows[[cat]] <- data.frame( + target = cat, + is_binary = unname(is_binary[cat]), + mean = m$mean, + se = m$se, + p = m$p, + n_traits = m$n_traits, + stringsAsFactors = FALSE + ) + } + df <- do.call(rbind, rows) + rownames(df) <- NULL + nm_old <- c("mean", "se", "p") + nm_new <- paste0(label, "_", nm_old) + names(df)[names(df) %in% nm_old] <- nm_new + df + } + + meta_tau_star_single <- build_table("tau_star", pt_view_single, "single") + meta_tau_star_joint <- build_table("tau_star", pt_view_joint, "joint") + meta_E_single <- build_table("enrichment", pt_view_single, "single") + meta_ES_single <- build_table("enrichstat", pt_view_single, "single") + + # Combine tau_star single + joint into one wide frame. + meta_tau_star <- meta_tau_star_single + ord <- match(meta_tau_star$target, meta_tau_star_joint$target) + meta_tau_star$joint_mean <- meta_tau_star_joint$joint_mean[ord] + meta_tau_star$joint_se <- meta_tau_star_joint$joint_se[ord] + meta_tau_star$joint_p <- meta_tau_star_joint$joint_p[ord] + + # Two-channel enrichment meta: effect/SE from E meta, p from EnrichStat meta. + meta_enrichment <- meta_E_single + meta_enrichment$single_p <- meta_ES_single$single_p[match(meta_enrichment$target, + meta_ES_single$target)] + + # Pure EnrichStat meta (separate frame). + meta_enrichstat <- meta_ES_single + + list( + per_trait = per_trait, + meta = list( + tau_star = meta_tau_star, + enrichment = meta_enrichment, + enrichstat = meta_enrichstat + ), + params = list( + maf_cutoff = maf_cutoff, + M_ref = M_ref, + target_categories = target_categories, + n_baseline = length(baseline_categories), + baseline_categories = baseline_categories, + trait_names = trait_names + ) + ) } From 4f0f71b7c203a251afd006f61ac8c822aaf3bc5b Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 7 May 2026 09:08:21 -0400 Subject: [PATCH 4/4] fix test --- tests/testthat/test_LD.R | 694 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 669 insertions(+), 25 deletions(-) diff --git a/tests/testthat/test_LD.R b/tests/testthat/test_LD.R index 058c6460..203e7c36 100644 --- a/tests/testthat/test_LD.R +++ b/tests/testthat/test_LD.R @@ -230,7 +230,7 @@ test_that("partition_LD_matrix validates block structure properly", { } # Expect an error for invalid block structure - expect_error(partition_LD_matrix(invalid_ld_data), "Matrix does not have the expected block structure") + expect_error(partition_LD_matrix(invalid_ld_data), "Matrix lacks expected block structure") } file.remove(LD_meta_file_path) @@ -824,14 +824,27 @@ test_that("can_merge checks chromosome and size", { expect_false(pecotmr:::can_merge(b1, b3, max_size = 500)) }) -# ---- check_ld ---- -test_that("check_ld detects positive definite matrix", { +# =========================================================================== +# check_ld (regularize_ld) +# =========================================================================== + +test_that("check_ld reports PD for identity matrix", { R <- diag(5) - result <- check_ld(R, method = "check") + result <- check_ld(R) + expect_true(result$is_pd) + expect_true(result$is_psd) + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) + expect_equal(result$condition_number, 1) +}) + +test_that("check_ld reports PD for well-conditioned correlation matrix", { + R <- matrix(0.3, 4, 4) + diag(R) <- 1 + result <- check_ld(R) expect_true(result$is_pd) expect_true(result$is_psd) expect_equal(result$n_negative, 0) - expect_equal(result$min_eigenvalue, 1) expect_equal(result$method_applied, "none") }) @@ -839,45 +852,676 @@ test_that("check_ld detects non-PSD matrix", { R <- matrix(0.9, 3, 3) diag(R) <- 1 R[1, 3] <- R[3, 1] <- -0.5 - result <- check_ld(R, method = "check") + result <- check_ld(R) expect_false(result$is_psd) expect_true(result$n_negative > 0) expect_true(result$min_eigenvalue < 0) expect_equal(result$method_applied, "none") - expect_equal(result$R, R) }) -test_that("check_ld eigenfix repairs non-PSD matrix", { +test_that("check_ld shrink method modifies non-PD matrix", { R <- matrix(0.9, 3, 3) diag(R) <- 1 R[1, 3] <- R[3, 1] <- -0.5 - result <- check_ld(R, method = "eigenfix") - expect_equal(result$method_applied, "eigenfix") - result2 <- check_ld(result$R, method = "check") - expect_true(result2$is_psd) - expect_equal(diag(result$R), rep(1, 3)) - expect_true(isSymmetric(result$R)) + result <- check_ld(R, method = "shrink") + expect_equal(result$method_applied, "shrink") + expect_false(identical(result$R, R)) + # With strong enough shrinkage, result should be PD + result2 <- check_ld(R, method = "shrink", shrinkage = 0.5) + eig <- eigen(result2$R, symmetric = TRUE) + expect_true(all(eig$values > 0)) }) -test_that("check_ld shrink repairs non-PD matrix", { +test_that("check_ld eigenfix method improves non-PD matrix", { R <- matrix(0.9, 3, 3) diag(R) <- 1 R[1, 3] <- R[3, 1] <- -0.5 - result <- check_ld(R, method = "shrink", shrinkage = 0.1) - expect_equal(result$method_applied, "shrink") - result2 <- check_ld(result$R, method = "check") - expect_true(result2$is_pd) + original_min_eval <- min(eigen(R, symmetric = TRUE)$values) + result <- check_ld(R, method = "eigenfix") + expect_equal(result$method_applied, "eigenfix") + # Eigenfix should improve the minimum eigenvalue + fixed_min_eval <- min(eigen(result$R, symmetric = TRUE)$values) + expect_true(fixed_min_eval > original_min_eval) + # Unit diagonal preserved + expect_equal(diag(result$R), rep(1, 3)) + # Symmetry preserved + expect_equal(result$R, t(result$R)) }) -test_that("check_ld eigenfix is no-op on PSD matrix", { - R <- diag(5) +test_that("check_ld shrink does nothing when matrix is already PD", { + R <- diag(3) + result <- check_ld(R, method = "shrink") + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) +}) + +test_that("check_ld eigenfix does nothing when matrix is already PD", { + R <- diag(3) result <- check_ld(R, method = "eigenfix") expect_equal(result$method_applied, "none") expect_equal(result$R, R) }) -test_that("check_ld condition_number is correct", { - R <- diag(c(1, 0.5, 0.1)) - result <- check_ld(R, method = "check") - expect_equal(result$condition_number, 10) +# =========================================================================== +# extract_block_matrices: out-of-range blocks +# =========================================================================== + +test_that("extract_block_matrices warns and skips out-of-range blocks", { + mat <- diag(4) + vnames <- paste0("v", 1:4) + rownames(mat) <- colnames(mat) <- vnames + block_metadata <- data.frame( + block_id = c(1, 2), + start_idx = c(1, 10), + end_idx = c(2, 12), + chrom = c("1", "1"), + block_start = c(100, 500), + block_end = c(200, 600), + size = c(2, 3), + stringsAsFactors = FALSE + ) + expect_warning( + result <- pecotmr:::extract_block_matrices(mat, block_metadata, vnames), + "outside the range" + ) + valid_blocks <- result$ld_matrices[!sapply(result$ld_matrices, is.null)] + expect_equal(length(valid_blocks), 1) + expect_equal(nrow(valid_blocks[[1]]), 2) +}) + +# =========================================================================== +# resolve_ld_source: type detection with real fixtures +# =========================================================================== + +geno_test_data_dir <- test_path("test_data") +geno_region_all <- "chr21:17513228-17592874" + +test_that("resolve_ld_source detects PLINK2 from metadata", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_p2_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "plink2") + expect_equal(result$meta_path, meta_file) +}) + +test_that("resolve_ld_source detects VCF from metadata", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_vcf_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants.vcf.gz", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "vcf") +}) + +test_that("resolve_ld_source detects GDS from metadata", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_gds_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants.gds", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "gds") +}) + +test_that("resolve_ld_source detects precomputed from metadata", { + # Existing LD block metadata with non-zero start/end + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_pre_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("chr1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), "\n", file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "precomputed") +}) + +test_that("resolve_ld_source errors on missing file", { + expect_error(pecotmr:::resolve_ld_source("/nonexistent/file.tsv"), "not found") +}) + +test_that("resolve_ld_source errors on 0:0 sentinel with non-genotype path", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_bad_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "nonexistent_prefix", sep = "\t"), "\n", + file = meta_file, append = TRUE) + expect_error(pecotmr:::resolve_ld_source(meta_file), "0:0 sentinel") +}) + +# =========================================================================== +# resolve_genotype_path_for_region +# =========================================================================== + +test_that("resolve_genotype_path_for_region resolves correct chromosome path", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_path_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_genotype_path_for_region(meta_file, geno_region_all) + expect_equal(result, file.path(geno_test_data_dir, "test_variants")) +}) + +test_that("resolve_genotype_path_for_region errors on missing chromosome", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_nochr_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("1", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + expect_error( + pecotmr:::resolve_genotype_path_for_region(meta_file, geno_region_all), + "No entry for chromosome" + ) +}) + +# =========================================================================== +# load_LD_from_genotype with real fixtures +# =========================================================================== + +test_that("load_LD_from_genotype returns LD matrix with .afreq", { + skip_if_not_installed("pgenlibr") + plink_prefix <- file.path(geno_test_data_dir, "test_variants") + result <- pecotmr:::load_LD_from_genotype(plink_prefix, geno_region_all) + expect_true(is.list(result)) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) + expect_true(isSymmetric(result$LD_matrix)) + expect_false(result$is_genotype) + # ref_panel should have allele_freq from .afreq file + expect_true("allele_freq" %in% names(result$ref_panel)) + expect_true(all(result$ref_panel$allele_freq > 0)) + expect_true(all(result$ref_panel$allele_freq < 1)) + # block_metadata + expect_true(is.data.frame(result$block_metadata)) + expect_equal(nrow(result$block_metadata), 1L) +}) + +test_that("load_LD_from_genotype returns genotype matrix when requested", { + skip_if_not_installed("pgenlibr") + plink_prefix <- file.path(geno_test_data_dir, "test_variants") + result <- pecotmr:::load_LD_from_genotype(plink_prefix, geno_region_all, + return_genotype = TRUE) + expect_true(result$is_genotype) + expect_equal(nrow(result$LD_matrix), 100L) # samples + expect_equal(ncol(result$LD_matrix), 349L) # variants +}) + +test_that("load_LD_from_genotype computes variance with n_sample", { + skip_if_not_installed("pgenlibr") + plink_prefix <- file.path(geno_test_data_dir, "test_variants") + result <- pecotmr:::load_LD_from_genotype(plink_prefix, geno_region_all, + n_sample = 100L) + expect_true("variance" %in% names(result$ref_panel)) + expect_true("n_nomiss" %in% names(result$ref_panel)) + expect_equal(result$ref_panel$n_nomiss[1], 100L) + expect_true(all(result$ref_panel$variance > 0)) +}) + +test_that("load_LD_from_genotype falls back to computed AF without .afreq", { + skip_if_not_installed("VariantAnnotation") + vcf_path <- file.path(geno_test_data_dir, "test_variants.vcf.gz") + result <- suppressWarnings( + pecotmr:::load_LD_from_genotype(vcf_path, geno_region_all) + ) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) + expect_true(isSymmetric(result$LD_matrix)) + # Allele frequencies computed from genotypes + expect_true("allele_freq" %in% names(result$ref_panel)) + expect_true(all(result$ref_panel$allele_freq > 0)) + expect_true(all(result$ref_panel$allele_freq < 1)) +}) + +test_that("load_LD_from_genotype works with GDS files", { + skip_if_not_installed("SNPRelate") + skip_if_not_installed("gdsfmt") + gds_path <- file.path(geno_test_data_dir, "test_variants.gds") + result <- pecotmr:::load_LD_from_genotype(gds_path, geno_region_all) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) + expect_true(isSymmetric(result$LD_matrix)) +}) + +test_that("load_LD_from_genotype .afreq and computed AF are consistent", { + skip_if_not_installed("pgenlibr") + skip_if_not_installed("SNPRelate") + skip_if_not_installed("gdsfmt") + plink_prefix <- file.path(geno_test_data_dir, "test_variants") + gds_path <- file.path(geno_test_data_dir, "test_variants.gds") + res_afreq <- pecotmr:::load_LD_from_genotype(plink_prefix, geno_region_all) + res_computed <- pecotmr:::load_LD_from_genotype(gds_path, geno_region_all) + # Allele frequencies should be close (same data, different source) + expect_true(max(abs(res_afreq$ref_panel$allele_freq - res_computed$ref_panel$allele_freq)) < 0.01) +}) + +# =========================================================================== +# load_LD_matrix with real genotype fixtures via metadata +# =========================================================================== + +test_that("load_LD_matrix dispatches to PLINK2 genotype source", { + skip_if_not_installed("pgenlibr") + meta_file <- file.path(geno_test_data_dir, "ld_meta_ldmat_p2_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, geno_region_all) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) + expect_false(result$is_genotype) +}) + +test_that("load_LD_matrix dispatches to VCF genotype source", { + skip_if_not_installed("VariantAnnotation") + meta_file <- file.path(geno_test_data_dir, "ld_meta_ldmat_vcf_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants.vcf.gz", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- suppressWarnings(load_LD_matrix(meta_file, geno_region_all)) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) +}) + +test_that("load_LD_matrix dispatches to GDS genotype source", { + skip_if_not_installed("SNPRelate") + skip_if_not_installed("gdsfmt") + meta_file <- file.path(geno_test_data_dir, "ld_meta_ldmat_gds_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants.gds", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, geno_region_all) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) +}) + +test_that("load_LD_matrix return_genotype='auto' returns X for genotype source", { + skip_if_not_installed("pgenlibr") + meta_file <- file.path(geno_test_data_dir, "ld_meta_ldmat_auto_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, geno_region_all, return_genotype = "auto") + expect_true(result$is_genotype) + expect_equal(nrow(result$LD_matrix), 100L) # samples + expect_equal(ncol(result$LD_matrix), 349L) # variants +}) + +test_that("load_LD_matrix return_genotype=TRUE errors for precomputed", { + meta_file <- gsub("//", "/", tempfile(pattern = "ld_meta_file", tmpdir = tempdir(), fileext = ".tsv")) + on.exit(unlink(meta_file), add = TRUE) + meta_df <- data.frame( + chrom = "chr1", start = 1000, end = 1200, + path = paste0( + "./test_data/LD_block_1.chr1_1000_1200.float16.txt.xz,", + "./test_data/LD_block_1.chr1_1000_1200.float16.bim" + ) + ) + write_delim(meta_df, meta_file, delim = "\t") + region <- data.frame(chrom = "chr1", start = 1000, end = 1190) + expect_error( + load_LD_matrix(meta_file, region, return_genotype = TRUE), + "genotype files" + ) +}) + +# =========================================================================== +# resolve_ld_source: PLINK1 detection +# =========================================================================== + +test_that("resolve_ld_source detects PLINK1 from metadata", { + skip_if_not_installed("snpStats") + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_p1_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("22", "0", "0", "protocol_example.genotype", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "plink1") +}) + +# =========================================================================== +# load_LD_matrix: precomputed blocks via real .cor.xz fixtures +# =========================================================================== + +test_that("load_LD_matrix loads single precomputed block", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_precomp_single_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), "\n", file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, "chr1:1000-1190") + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 5L) + expect_true(isSymmetric(result$LD_matrix)) + expect_equal(length(result$LD_variants), 5L) + expect_true(all(grepl("^chr1:", result$LD_variants))) + expect_false(result$is_genotype) + # block_metadata should have one block + expect_equal(nrow(result$block_metadata), 1L) + # ref_panel should have variant info + expect_true("chrom" %in% names(result$ref_panel)) + expect_true("pos" %in% names(result$ref_panel)) +}) + +test_that("load_LD_matrix loads multiple precomputed blocks", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_precomp_multi_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + lines <- c( + paste("chrom", "start", "end", "path", sep = "\t"), + paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), + paste("1", "1200", "1400", + "LD_block_2.chr1_1200_1400.float16.txt.xz,LD_block_2.chr1_1200_1400.float16.bim", + sep = "\t"), + paste("1", "1400", "1600", + "LD_block_3.chr1_1400_1600.float16.txt.xz,LD_block_3.chr1_1400_1600.float16.bim", + sep = "\t") + ) + writeLines(lines, meta_file) + result <- load_LD_matrix(meta_file, "chr1:1000-1500") + expect_true(is.matrix(result$LD_matrix)) + # Should span blocks 1-3: 5 + 5 + 5 = 15 unique variants (no overlap in variant IDs) + expect_true(nrow(result$LD_matrix) >= 10) + expect_true(isSymmetric(result$LD_matrix)) + expect_true(nrow(result$block_metadata) >= 2) +}) + +test_that("load_LD_matrix with n_sample for precomputed blocks with freq data", { + # The 9-column bim format includes allele_freq, variance, n_nomiss; + # the 6-column bim does not. With 6-col bim and no allele_freq, + # n_sample cannot compute variance — ref_panel has base columns only. + meta_file <- file.path(geno_test_data_dir, "ld_meta_precomp_nsamp_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), "\n", file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, "chr1:1000-1190", n_sample = 500L) + # ref_panel should always have basic variant info + expect_true(all(c("chrom", "pos", "A2", "A1", "variant_id") %in% names(result$ref_panel))) + # 6-col bim lacks allele_freq so variance computation is skipped + expect_false("variance" %in% names(result$ref_panel)) +}) + +# =========================================================================== +# process_LD_matrix: real .cor.xz fixtures +# =========================================================================== + +test_that("process_LD_matrix reads .cor.xz with explicit bim path", { + ld_file <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.txt.xz") + bim_file <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.bim") + result <- pecotmr:::process_LD_matrix(ld_file, bim_file) + expect_true(is.list(result)) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 5L) + expect_equal(ncol(result$LD_matrix), 5L) + expect_true(isSymmetric(result$LD_matrix)) + # Diagonal should be 1 + expect_true(all(abs(diag(result$LD_matrix) - 1) < 1e-4)) + # Variant names should be chr:pos:A2:A1 format + expect_true(all(grepl("^chr1:", rownames(result$LD_matrix)))) + # LD_variants data frame + expect_true(is.data.frame(result$LD_variants)) + expect_true("variants" %in% names(result$LD_variants)) + expect_equal(nrow(result$LD_variants), 5L) +}) + +test_that("process_LD_matrix reads different blocks consistently", { + bim1 <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.bim") + bim2 <- file.path(geno_test_data_dir, "LD_block_2.chr1_1200_1400.float16.bim") + ld1 <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.txt.xz") + ld2 <- file.path(geno_test_data_dir, "LD_block_2.chr1_1200_1400.float16.txt.xz") + r1 <- pecotmr:::process_LD_matrix(ld1, bim1) + r2 <- pecotmr:::process_LD_matrix(ld2, bim2) + # Different blocks should have different variant positions + expect_false(any(rownames(r1$LD_matrix) %in% rownames(r2$LD_matrix))) +}) + +test_that("process_LD_matrix reads 9-column bim with allele_freq/variance/n_nomiss", { + ld_file <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.txt.xz") + bim_file <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.9col.bim") + result <- pecotmr:::process_LD_matrix(ld_file, bim_file) + expect_equal(nrow(result$LD_matrix), 5L) + expect_true(isSymmetric(result$LD_matrix)) + # 9-column bim should include extra columns + expect_true("allele_freq" %in% names(result$LD_variants)) + expect_true("variance" %in% names(result$LD_variants)) + expect_true("n_nomiss" %in% names(result$LD_variants)) + expect_equal(result$LD_variants$allele_freq, c(0.3, 0.4, 0.2, 0.5, 0.15)) + expect_equal(result$LD_variants$n_nomiss, rep(500, 5)) +}) + +test_that("load_LD_matrix propagates allele_freq/variance/n_nomiss from 9-col bim", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_precomp_9col_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.9col.bim", + sep = "\t"), "\n", file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, "chr1:1000-1190") + # ref_panel should carry the extra columns from the 9-col bim + expect_true("allele_freq" %in% names(result$ref_panel)) + expect_true("variance" %in% names(result$ref_panel)) + expect_true("n_nomiss" %in% names(result$ref_panel)) + expect_equal(result$ref_panel$allele_freq, c(0.3, 0.4, 0.2, 0.5, 0.15)) + expect_equal(result$ref_panel$n_nomiss, rep(500, 5)) +}) + +# =========================================================================== +# get_regional_ld_meta: real .cor.xz fixtures +# =========================================================================== + +test_that("get_regional_ld_meta returns correct file paths for single block", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_regional_single_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + lines <- c( + paste("chrom", "start", "end", "path", sep = "\t"), + paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), + paste("1", "1200", "1400", + "LD_block_2.chr1_1200_1400.float16.txt.xz,LD_block_2.chr1_1200_1400.float16.bim", + sep = "\t") + ) + writeLines(lines, meta_file) + result <- pecotmr:::get_regional_ld_meta(meta_file, "chr1:1050-1150") + expect_true(is.list(result)) + expect_true(length(result$intersections$LD_file_paths) >= 1) + # All returned paths should exist + expect_true(all(file.exists(result$intersections$LD_file_paths))) + expect_true(all(file.exists(result$intersections$bim_file_paths))) +}) + +test_that("get_regional_ld_meta spans multiple blocks for wide region", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_regional_multi_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + lines <- c( + paste("chrom", "start", "end", "path", sep = "\t"), + paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), + paste("1", "1200", "1400", + "LD_block_2.chr1_1200_1400.float16.txt.xz,LD_block_2.chr1_1200_1400.float16.bim", + sep = "\t"), + paste("1", "1400", "1600", + "LD_block_3.chr1_1400_1600.float16.txt.xz,LD_block_3.chr1_1400_1600.float16.bim", + sep = "\t") + ) + writeLines(lines, meta_file) + result <- pecotmr:::get_regional_ld_meta(meta_file, "chr1:1000-1500") + expect_true(length(result$intersections$LD_file_paths) >= 2) +}) + +# =========================================================================== +# drop_collinear_columns: strategy variants +# =========================================================================== + +test_that("drop_collinear_columns variance strategy removes lowest-variance column", { + set.seed(42) + X <- matrix(rnorm(100 * 4), 100, 4) + colnames(X) <- c("a", "b", "c", "d") + # Make column "c" have near-zero variance + X[, "c"] <- X[1, "c"] + result <- pecotmr:::drop_collinear_columns( + X, c("b", "c", "d"), strategy = "variance" + ) + expect_false("c" %in% colnames(result)) + expect_equal(ncol(result), 3L) +}) + +test_that("drop_collinear_columns response_correlation strategy works", { + set.seed(42) + X <- matrix(rnorm(100 * 3), 100, 3) + colnames(X) <- c("a", "b", "c") + y <- X[, "a"] + rnorm(100, sd = 0.1) # y correlates strongly with "a" + result <- pecotmr:::drop_collinear_columns( + X, c("a", "b", "c"), strategy = "response_correlation", response = y + ) + # Should keep "a" (highest |cor| with response) and remove one of b/c + expect_true("a" %in% colnames(result)) + expect_equal(ncol(result), 2L) +}) + +test_that("drop_collinear_columns response_correlation errors without response", { + X <- matrix(1:12, 4, 3) + colnames(X) <- c("a", "b", "c") + expect_error( + pecotmr:::drop_collinear_columns(X, c("a", "b"), strategy = "response_correlation"), + "response must be supplied" + ) +}) + +test_that("drop_collinear_columns with single problematic column removes it", { + X <- matrix(rnorm(40), 10, 4) + colnames(X) <- c("a", "b", "c", "d") + result <- pecotmr:::drop_collinear_columns(X, "b", strategy = "correlation") + expect_false("b" %in% colnames(result)) + expect_equal(ncol(result), 3L) +}) + +# =========================================================================== +# enforce_design_full_rank: additional strategies and fallback paths +# =========================================================================== + +test_that("enforce_design_full_rank variance strategy produces full rank", { + set.seed(42) + X <- matrix(rnorm(100 * 4), 100, 4) + X[, 4] <- X[, 1] + X[, 2] # rank deficient + colnames(X) <- c("a", "b", "c", "d") + C <- matrix(rnorm(100), 100, 1) + result <- enforce_design_full_rank(X, C, strategy = "variance") + full_design <- cbind(1, result, C) + expect_equal(qr(full_design)$rank, ncol(full_design)) + expect_true(ncol(result) < ncol(X)) +}) + +test_that("enforce_design_full_rank response_correlation strategy works", { + set.seed(42) + X <- matrix(rnorm(100 * 4), 100, 4) + X[, 4] <- X[, 1] + X[, 2] + colnames(X) <- c("a", "b", "c", "d") + C <- matrix(rnorm(100), 100, 1) + y <- X[, "a"] + rnorm(100, sd = 0.1) + result <- enforce_design_full_rank(X, C, strategy = "response_correlation", response = y) + full_design <- cbind(1, result, C) + expect_equal(qr(full_design)$rank, ncol(full_design)) +}) + +test_that("enforce_design_full_rank returns unchanged X when already full rank", { + set.seed(42) + X <- matrix(rnorm(100 * 3), 100, 3) + colnames(X) <- c("a", "b", "c") + C <- matrix(rnorm(100), 100, 1) + result <- enforce_design_full_rank(X, C, strategy = "correlation") + expect_equal(ncol(result), ncol(X)) +}) + +test_that("enforce_design_full_rank fallback to correlation pruning works", { + set.seed(42) + n <- 50 + p <- 10 + X <- matrix(rnorm(n * 3), n, 3) + # Create highly collinear columns that are hard for iterative removal + X <- cbind(X, X[, 1] + rnorm(n, sd = 1e-10), + X[, 2] + rnorm(n, sd = 1e-10), + X[, 3] + rnorm(n, sd = 1e-10), + X[, 1] + X[, 2] + rnorm(n, sd = 1e-10)) + colnames(X) <- paste0("v", seq_len(ncol(X))) + C <- matrix(rnorm(n), n, 1) + result <- enforce_design_full_rank(X, C, strategy = "correlation", + max_iterations = 2L) + full_design <- cbind(1, result, C) + expect_equal(qr(full_design)$rank, ncol(full_design)) +}) + +# =========================================================================== +# ld_clump_by_score: edge cases +# =========================================================================== + +test_that("ld_clump_by_score errors on empty matrix", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + X <- matrix(numeric(0), nrow = 10, ncol = 0) + expect_error(ld_clump_by_score(X, score = numeric(0), chr = integer(0), pos = integer(0)), + "at least one column") +}) + +test_that("ld_clump_by_score returns 1L for single variant", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + X <- matrix(c(0, 1, 2, 1, 0), ncol = 1) + result <- ld_clump_by_score(X, score = 1.0, chr = 1L, pos = 100L) + expect_equal(result, 1L) +}) + +test_that("ld_clump_by_score errors on mismatched score length", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + X <- matrix(rnorm(20), 5, 4) + expect_error(ld_clump_by_score(X, score = c(1, 2), chr = rep(1L, 4), pos = 1:4), + "length\\(score\\)") +}) + +test_that("ld_clump_by_score errors on mismatched chr/pos length", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + X <- matrix(rnorm(20), 5, 4) + expect_error(ld_clump_by_score(X, score = runif(4), chr = rep(1L, 2), pos = 1:4), + "chr and pos") +}) + +# =========================================================================== +# ld_prune_by_correlation: verbose paths +# =========================================================================== + +test_that("ld_prune_by_correlation verbose reports pruning", { + # Create matrix with correlated columns + set.seed(42) + base <- rnorm(100) + X <- cbind(base, base + rnorm(100, sd = 0.1), rnorm(100), rnorm(100), rnorm(100)) + colnames(X) <- paste0("v", 1:5) + expect_message( + ld_prune_by_correlation(X, cor_thres = 0.5, verbose = TRUE), + "pruned" + ) +}) + +test_that("ld_prune_by_correlation verbose reports no pruning", { + # Create a small matrix with no correlated columns + set.seed(42) + X <- matrix(rnorm(500), 100, 5) + colnames(X) <- paste0("v", 1:5) + expect_message( + ld_prune_by_correlation(X, cor_thres = 0.999, verbose = TRUE), + "no columns pruned" + ) })