diff --git a/DESCRIPTION b/DESCRIPTION index ca1a7691..035eb535 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -68,4 +68,4 @@ LinkingTo: RcppArmadillo NeedsCompilation: yes VignetteBuilder: knitr -RoxygenNote: 7.3.3 +Config/roxygen2/version: 8.0.0 diff --git a/R/sldsc_wrapper.R b/R/sldsc_wrapper.R new file mode 100644 index 00000000..b8e17c80 --- /dev/null +++ b/R/sldsc_wrapper.R @@ -0,0 +1,733 @@ +# Stratified LD Score Regression (S-LDSC) post-processing wrappers around polyfun. +# +# This file provides the post-processing layer for the xqtl-protocol sLDSC pipeline: +# read polyfun outputs per trait, compute Gazal-style standardized tau* and the +# differential per-SNP heritability statistic (EnrichStat), and run DerSimonian-Laird +# random-effects meta-analysis across traits. +# +# Reference panel convention: all LD-derived quantities (baseline LD scores, +# target LD scores, regression weights, allele frequencies) must come from the +# same reference panel. Do not mix files from different panels (e.g. 1000G vs ADSP). +# +# MAF convention: by default we restrict to MAF > 5% per the sLDSC recommendation. +# Pass maf_cutoff = 0 to opt out (not recommended). +# +# Cross-type comparison: tau* (Gazal et al. 2017 standardization) is the +# cross-type comparable statistic. Use tau* to rank or meta-analyze annotations +# that mix binary and continuous types. E (proportion-based enrichment) is +# scale-dependent for continuous annotations and is only comparable within type. + +# ---- internal helpers ---- + +.sldsc_std_cols <- c("CHR", "SNP", "BP", "CM", "A1", "A2", "MAF") + +.sldsc_chrom_from_filename <- function(f) { + bn <- basename(f) + m <- regmatches(bn, regexec("\\.([0-9]+)\\.annot\\.gz$", bn))[[1]] + if (length(m) >= 2) as.integer(m[2]) else NA_integer_ +} + +.sldsc_detect_annot_cols <- function(file_path) { + sample <- data.table::fread(file_path, nrows = 5L) + setdiff(names(sample), .sldsc_std_cols) +} + + +#' @title Read S-LDSC outputs from polyfun for one trait/run +#' +#' @description Reads the regression outputs produced by `polyfun/ldsc.py` for a +#' single polyfun run (one trait, one annotation set) and returns them as a +#' tidy list ready for downstream standardization. Hides the underlying file +#' formats; downstream code consumes only modeling quantities. +#' +#' @param prefix Character. Path prefix to the polyfun outputs for one trait/run. +#' The function appends `.results`, `.log`, and `.part_delete` to this prefix. +#' Example: `"/path/to/cwd/CAD_META.filtered.sumstats.gz"`. +#' +#' @return A named list. See `sldsc_postprocessing_pipeline` for components. +#' +#' @examples +#' \dontrun{ +#' run <- read_sldsc_trait("/output/CAD_META.filtered.sumstats.gz") +#' run$tau["my_target_annotation"] +#' } +#' +#' @importFrom data.table fread +#' @importFrom stats setNames var +#' @export +read_sldsc_trait <- function(prefix) { + results_file <- paste0(prefix, ".results") + log_file <- paste0(prefix, ".log") + delete_file <- paste0(prefix, ".part_delete") + + for (f in c(results_file, log_file, delete_file)) { + if (!file.exists(f)) stop("read_sldsc_trait: missing file: ", f) + } + + results <- data.table::fread(results_file) + cats <- as.character(results$Category) + + log_lines <- readLines(log_file, warn = FALSE) + h2_line <- grep("Total Observed scale h2:", log_lines, value = TRUE) + if (length(h2_line) == 0L) + stop("read_sldsc_trait: could not find 'Total Observed scale h2:' in ", log_file) + h2g <- suppressWarnings(as.numeric(gsub(".*h2: (-?[0-9.eE+-]+).*", "\\1", h2_line[1]))) + if (is.na(h2g)) + stop("read_sldsc_trait: failed to parse h2g numeric from log line: ", h2_line[1]) + + delete_values <- as.matrix(data.table::fread(delete_file)) + if (ncol(delete_values) != length(cats)) { + stop("read_sldsc_trait: .part_delete has ", ncol(delete_values), + " columns but .results has ", length(cats), " categories.") + } + colnames(delete_values) <- cats + + list( + categories = cats, + tau = stats::setNames(as.numeric(results$Coefficient), cats), + tau_se = stats::setNames(as.numeric(results[["Coefficient_std_error"]]), cats), + enrichment = stats::setNames(as.numeric(results$Enrichment), cats), + enrichment_se = stats::setNames(as.numeric(results[["Enrichment_std_error"]]), cats), + enrichment_p = stats::setNames(as.numeric(results[["Enrichment_p"]]), cats), + prop_h2 = stats::setNames(as.numeric(results[["Prop._h2"]]), cats), + prop_snps = stats::setNames(as.numeric(results[["Prop._SNPs"]]), cats), + h2g = h2g, + tau_blocks = delete_values, + n_blocks = nrow(delete_values) + ) +} + + +#' @title Compute per-annotation standard deviation, MAF-restricted +#' +#' @description Computes the standard deviation of each annotation column in the +#' target annotation files, restricted to SNPs above a MAF cutoff via PLINK +#' `.frq` files. Required for internal consistency with polyfun's regression, +#' which operates on MAF > cutoff SNPs by default. +#' +#' @param target_anno_dir Character. Directory containing target annotation files +#' (one per chromosome) in polyfun's `.annot.gz` format. +#' @param frqfile_dir Character or NULL. Directory containing PLINK `.frq` files +#' for the reference panel. Required when `maf_cutoff > 0`; the function +#' errors if missing. +#' @param plink_name Character. Filename prefix of the `.frq` files +#' (e.g. `"ADSP_chr"`). Files are expected at `{plink_name}{chr}.frq`. +#' @param maf_cutoff Numeric, default `0.05`. +#' @param annot_cols Character or integer vector, default NULL. Annotation columns +#' to compute sd for. If NULL, all annotation columns are used. +#' +#' @return Named numeric vector of \eqn{sd_C} values, one per annotation. +#' +#' @importFrom data.table fread +#' @importFrom stats setNames var +#' @export +compute_sldsc_annot_sd <- function(target_anno_dir, frqfile_dir = NULL, + plink_name = "ADSP_chr", + maf_cutoff = 0.05, annot_cols = NULL) { + if (maf_cutoff > 0 && (is.null(frqfile_dir) || !dir.exists(frqfile_dir))) { + stop("compute_sldsc_annot_sd: maf_cutoff = ", maf_cutoff, + " requires frqfile_dir, but '", frqfile_dir, "' is not a directory.") + } + if (!dir.exists(target_anno_dir)) { + stop("compute_sldsc_annot_sd: target_anno_dir does not exist: ", target_anno_dir) + } + + anno_files <- list.files(target_anno_dir, pattern = "\\.annot\\.gz$", full.names = TRUE) + if (length(anno_files) == 0L) + stop("compute_sldsc_annot_sd: no .annot.gz files in: ", target_anno_dir) + + detected <- .sldsc_detect_annot_cols(anno_files[1]) + if (is.null(annot_cols)) { + cols_use <- detected + } else if (is.numeric(annot_cols)) { + cols_use <- detected[annot_cols] + } else { + cols_use <- annot_cols + } + if (length(cols_use) == 0L) + stop("compute_sldsc_annot_sd: no annotation columns to process.") + + num <- stats::setNames(numeric(length(cols_use)), cols_use) + den <- 0 + + for (anno_file in anno_files) { + dat <- data.table::fread(anno_file) + if (maf_cutoff > 0) { + chrom <- .sldsc_chrom_from_filename(anno_file) + if (is.na(chrom)) + stop("compute_sldsc_annot_sd: could not parse chromosome from: ", anno_file) + frq_file <- file.path(frqfile_dir, paste0(plink_name, chrom, ".frq")) + if (!file.exists(frq_file)) + stop("compute_sldsc_annot_sd: .frq file not found: ", frq_file) + frq <- data.table::fread(frq_file, select = c("SNP", "MAF")) + dat <- merge(dat, frq, by = "SNP", all.x = FALSE, all.y = FALSE) + dat <- dat[!is.na(dat$MAF) & dat$MAF > maf_cutoff, ] + } + if (nrow(dat) <= 1L) next + n_minus_1 <- nrow(dat) - 1L + for (col in cols_use) { + vals <- as.numeric(dat[[col]]) + v <- stats::var(vals, na.rm = TRUE) + if (!is.na(v)) num[col] <- num[col] + n_minus_1 * v + } + den <- den + n_minus_1 + } + + if (den <= 0) + stop("compute_sldsc_annot_sd: zero degrees of freedom after MAF filtering.") + sqrt(num / den) +} + + +#' @title Reference-panel SNP count at a given MAF cutoff +#' +#' @description Returns the number of SNPs in the reference panel above the MAF +#' cutoff. When `maf_cutoff > 0`, counts MAF > cutoff entries across all +#' `.frq` files. When `maf_cutoff == 0`, counts rows of `.l2.ldscore` +#' files in `target_anno_dir`. +#' +#' @param target_anno_dir Character or NULL. Directory of `.l2.ldscore` files +#' produced by polyfun's `compute_ldscores.py`. Required when +#' `maf_cutoff == 0`. +#' @param frqfile_dir Character or NULL. Directory of PLINK `.frq` files. +#' Required when `maf_cutoff > 0`. +#' @param plink_name Character. Filename prefix of `.frq` files. +#' @param maf_cutoff Numeric, default `0.05`. +#' +#' @return Scalar integer. +#' +#' @importFrom data.table fread +#' @export +compute_sldsc_M_ref <- function(target_anno_dir = NULL, frqfile_dir = NULL, + plink_name = "ADSP_chr", maf_cutoff = 0.05) { + if (maf_cutoff > 0) { + if (is.null(frqfile_dir) || !dir.exists(frqfile_dir)) + stop("compute_sldsc_M_ref: maf_cutoff = ", maf_cutoff, + " requires frqfile_dir.") + pat <- paste0("^", gsub("([.])", "\\\\\\1", plink_name), "[0-9]+\\.frq$") + frq_files <- list.files(frqfile_dir, pattern = pat, full.names = TRUE) + if (length(frq_files) == 0L) + frq_files <- list.files(frqfile_dir, pattern = "\\.frq$", full.names = TRUE) + if (length(frq_files) == 0L) + stop("compute_sldsc_M_ref: no .frq files found in: ", frqfile_dir) + + total <- 0L + for (f in frq_files) { + frq <- data.table::fread(f, select = "MAF") + total <- total + sum(!is.na(frq$MAF) & frq$MAF > maf_cutoff) + } + return(as.integer(total)) + } + + if (is.null(target_anno_dir) || !dir.exists(target_anno_dir)) + stop("compute_sldsc_M_ref: maf_cutoff = 0 requires target_anno_dir.") + files <- list.files(target_anno_dir, + pattern = "\\.l2\\.ldscore\\.(gz|parquet)$", + full.names = TRUE) + if (length(files) == 0L) + stop("compute_sldsc_M_ref: no .l2.ldscore files in: ", target_anno_dir) + + total <- 0L + for (f in files) { + if (endsWith(f, ".parquet")) { + if (!requireNamespace("arrow", quietly = TRUE)) + stop("compute_sldsc_M_ref: install 'arrow' to read .parquet files.") + total <- total + nrow(arrow::read_parquet(f)) + } else { + total <- total + nrow(data.table::fread(f)) + } + } + as.integer(total) +} + + +#' @title Detect whether each annotation is binary or continuous +#' +#' @description Inspects each annotation column and returns whether its values +#' lie in \{0, 1\} (binary) or take other values (continuous). +#' +#' @param target_anno_dir Character. Directory containing the target `.annot.gz` +#' files (one per chromosome). +#' @param annot_cols Character or integer vector, default NULL. +#' +#' @return Named logical vector: TRUE for binary, FALSE for continuous. +#' +#' @importFrom data.table fread +#' @importFrom stats setNames +#' @export +is_binary_sldsc_annot <- function(target_anno_dir, annot_cols = NULL) { + anno_files <- list.files(target_anno_dir, pattern = "\\.annot\\.gz$", full.names = TRUE) + if (length(anno_files) == 0L) + stop("is_binary_sldsc_annot: no .annot.gz files in: ", target_anno_dir) + + detected <- .sldsc_detect_annot_cols(anno_files[1]) + if (is.null(annot_cols)) { + cols_use <- detected + } else if (is.numeric(annot_cols)) { + cols_use <- detected[annot_cols] + } else { + cols_use <- annot_cols + } + + is_binary <- stats::setNames(rep(TRUE, length(cols_use)), cols_use) + + for (f in anno_files) { + dat <- data.table::fread(f, select = cols_use) + for (col in cols_use) { + if (!is_binary[[col]]) next + vals <- unique(stats::na.omit(as.numeric(dat[[col]]))) + if (any(!(vals %in% c(0, 1)))) is_binary[[col]] <- FALSE + } + if (!any(is_binary)) break + } + + is_binary +} + + +#' @title Standardize tau and compute EnrichStat for one polyfun run +#' +#' @description Applies the Gazal standardization +#' \eqn{\tau^*_C = \tau_C \cdot sd_C \cdot M_{ref} / h^2_g} to the point and +#' to each jackknife block. For `mode = "single"`, additionally computes +#' EnrichStat and back-solves its standard error from polyfun's reported +#' `Enrichment_p` using \eqn{|Z| = \Phi^{-1}(1 - p/2)}. +#' +#' @param trait_data List from \code{\link{read_sldsc_trait}}. +#' @param sd_annot Named numeric vector from \code{\link{compute_sldsc_annot_sd}}. +#' @param M_ref Scalar from \code{\link{compute_sldsc_M_ref}}. +#' @param target_categories Character vector or NULL. If NULL, intersects +#' `trait_data$categories` with `names(sd_annot)`. +#' @param mode Character: `"single"` or `"joint"`. +#' +#' @return A list with `summary` (data frame), `tau_star_blocks` (matrix), +#' `h2g`, `n_blocks`, `mode`. +#' +#' @importFrom stats qnorm var +#' @export +standardize_sldsc_trait <- function(trait_data, sd_annot, M_ref, + target_categories = NULL, + mode = c("single", "joint")) { + mode <- match.arg(mode) + if (is.null(target_categories)) + target_categories <- intersect(trait_data$categories, names(sd_annot)) + if (length(target_categories) == 0L) + stop("standardize_sldsc_trait: no target categories.") + + target_idx <- match(target_categories, trait_data$categories) + if (any(is.na(target_idx))) + stop("standardize_sldsc_trait: missing categories: ", + paste(target_categories[is.na(target_idx)], collapse = ", ")) + + h2g <- trait_data$h2g + sd_target <- as.numeric(sd_annot[target_categories]) + if (any(is.na(sd_target) | sd_target == 0)) + warning("standardize_sldsc_trait: zero/NA sd for some targets; tau* will be NA/0.") + + coef <- sd_target * M_ref / h2g + tau <- as.numeric(trait_data$tau[target_categories]) + tau_se <- as.numeric(trait_data$tau_se[target_categories]) + tau_star <- tau * coef + + blocks_target <- trait_data$tau_blocks[, target_idx, drop = FALSE] + tau_star_blocks <- sweep(blocks_target, 2L, coef, FUN = "*") + + B <- trait_data$n_blocks + jk_var <- apply(tau_star_blocks, 2L, function(x) stats::var(x, na.rm = TRUE)) + tau_star_se <- sqrt((B - 1)^2 / B * jk_var) + + summary_df <- data.frame( + target = target_categories, + tau = tau, + tau_se = tau_se, + tau_star = tau_star, + tau_star_se = tau_star_se, + stringsAsFactors = FALSE + ) + + if (mode == "single") { + enrich <- as.numeric(trait_data$enrichment[target_categories]) + enrich_se <- as.numeric(trait_data$enrichment_se[target_categories]) + enrich_p <- as.numeric(trait_data$enrichment_p[target_categories]) + p_h2 <- as.numeric(trait_data$prop_h2[target_categories]) + p_M <- as.numeric(trait_data$prop_snps[target_categories]) + + diff_ratio <- (p_h2 / p_M) - (1 - p_h2) / (1 - p_M) + enrichstat <- (h2g / M_ref) * diff_ratio + + abs_z <- stats::qnorm(1 - enrich_p / 2) + enrichstat_se <- abs(enrichstat) / abs_z + enrichstat_se[!is.finite(abs_z) | abs_z <= 0] <- NA_real_ + + summary_df$enrichment <- enrich + summary_df$enrichment_se <- enrich_se + summary_df$enrichment_p <- enrich_p + summary_df$enrichstat <- enrichstat + summary_df$enrichstat_se <- enrichstat_se + } + + list( + summary = summary_df, + tau_star_blocks = tau_star_blocks, + h2g = h2g, + n_blocks = B, + mode = mode + ) +} + + +#' @title Random-effects meta-analysis of S-LDSC quantities across traits +#' +#' @description DerSimonian-Laird random-effects meta-analysis of one S-LDSC +#' quantity for one annotation across multiple traits. +#' +#' @details Per-trait \eqn{SE_i} sources: +#' - `quantity = "tau_star"`: jackknife SE from per-block \eqn{\tau^*}. +#' - `quantity = "enrichment"`: polyfun-reported `Enrichment_std_error`. +#' - `quantity = "enrichstat"`: back-solved SE from polyfun's `Enrichment_p`. +#' +#' @param per_trait_estimates Named list of per-trait results (each with a +#' `summary` data frame). +#' @param category Character. Annotation name to meta-analyze. +#' @param quantity Character: `"tau_star"`, `"enrichment"`, or `"enrichstat"`. +#' +#' @return List with `mean`, `se`, `p`, `n_traits`, `traits_used`, `tau2`. +#' +#' @importFrom stats pnorm +#' @export +meta_sldsc_random <- function(per_trait_estimates, category, + quantity = c("tau_star", "enrichment", "enrichstat")) { + quantity <- match.arg(quantity) + col_pairs <- list( + tau_star = c("tau_star", "tau_star_se"), + enrichment = c("enrichment", "enrichment_se"), + enrichstat = c("enrichstat", "enrichstat_se") + ) + cols <- col_pairs[[quantity]] + trait_names <- names(per_trait_estimates) + if (is.null(trait_names)) + trait_names <- as.character(seq_along(per_trait_estimates)) + + means <- numeric(0); ses <- numeric(0); used <- character(0) + for (i in seq_along(per_trait_estimates)) { + pt <- per_trait_estimates[[i]] + if (is.null(pt) || is.null(pt$summary)) next + df <- pt$summary + row <- df[df$target == category, , drop = FALSE] + if (nrow(row) == 0L) next + if (!all(cols %in% names(row))) next + m <- as.numeric(row[[cols[1]]])[1] + s <- as.numeric(row[[cols[2]]])[1] + if (is.na(m) || is.na(s) || !is.finite(s) || s <= 0) next + means <- c(means, m); ses <- c(ses, s); used <- c(used, trait_names[i]) + } + + if (length(means) < 2L) { + return(list(mean = NA_real_, se = NA_real_, p = NA_real_, + n_traits = length(means), traits_used = used, + tau2 = NA_real_)) + } + if (!requireNamespace("rmeta", quietly = TRUE)) + stop("meta_sldsc_random: install the 'rmeta' package.") + + meta <- rmeta::meta.summaries(means, ses, method = "random") + z <- meta$summary / meta$se.summary + p <- 2 * stats::pnorm(-abs(z)) + list( + mean = as.numeric(meta$summary), + se = as.numeric(meta$se.summary), + p = as.numeric(p), + n_traits = length(means), + traits_used = used, + tau2 = as.numeric(meta$tau2) + ) +} + + +# Internal helper: assemble a wide per-trait summary frame with single + joint +# columns side by side. +.sldsc_assemble_trait_summary <- function(single_df, joint_df, target_categories, + is_binary_vec) { + rows <- if (!is.null(single_df)) single_df$target else + if (!is.null(joint_df)) joint_df$target else target_categories + out <- data.frame(target = rows, + is_binary = unname(is_binary_vec[rows]), + stringsAsFactors = FALSE) + + add_cols <- function(out, src, suffix) { + cols_to_add <- c("tau", "tau_se", "tau_star", "tau_star_se", + "enrichment", "enrichment_se", "enrichment_p", + "enrichstat", "enrichstat_se") + for (c in cols_to_add) { + newcol <- paste0(c, "_", suffix) + if (!is.null(src) && c %in% names(src)) { + out[[newcol]] <- src[[c]][match(out$target, src$target)] + } else { + out[[newcol]] <- NA_real_ + } + } + out + } + out <- add_cols(out, single_df, "single") + out <- add_cols(out, joint_df, "joint") + out +} + + +# Internal helper: build a per-trait list view that meta_sldsc_random can read. +# Each list element has a $summary frame with the requested mode's columns +# renamed to the canonical names (tau_star, tau_star_se, enrichment, ...). +.sldsc_view_for_meta <- function(per_trait, suffix) { + lapply(per_trait, function(pt) { + if (is.null(pt$summary)) return(NULL) + df <- pt$summary + cols_have <- c("tau_star", "tau_star_se", "enrichment", "enrichment_se", + "enrichment_p", "enrichstat", "enrichstat_se") + src_cols <- paste0(cols_have, "_", suffix) + avail <- src_cols %in% names(df) + if (!any(avail)) return(NULL) + new_df <- data.frame(target = df$target, stringsAsFactors = FALSE) + for (k in seq_along(cols_have)) { + if (avail[k]) new_df[[cols_have[k]]] <- df[[src_cols[k]]] + } + list(summary = new_df) + }) +} + + +#' @title End-to-end S-LDSC post-processing across traits, single + joint in one pass +#' +#' @description Top-level orchestration. Reads polyfun outputs (one single-target +#' run per target plus, when available, one joint run per trait), standardizes +#' both modes, and runs the default random-effects meta across all traits. +#' +#' @param trait_single_prefixes Named list. For each trait, a character vector +#' of length \eqn{N} giving the polyfun output prefixes for the \eqn{N} +#' single-target runs (order must match `target_categories`). +#' @param trait_joint_prefix Named character. For each trait, the polyfun output +#' prefix for the joint run. Pass `NA` (or `""`) for a trait without a joint run. +#' @param target_anno_dir Character. Directory of target `.annot.gz` files used +#' for `sd_C` and binary detection (typically the joint-mode dir). +#' @param frqfile_dir Character or NULL. +#' @param plink_name Character. Default `"ADSP_chr"`. +#' @param maf_cutoff Numeric, default `0.05`. +#' @param target_categories Character vector or NULL. Auto-detected from the +#' first available run if NULL. +#' +#' @return List with `per_trait`, `meta` (three frames), `params`. +#' +#' @export +sldsc_postprocessing_pipeline <- function(trait_single_prefixes, + trait_joint_prefix, + target_anno_dir, + frqfile_dir = NULL, + plink_name = "ADSP_chr", + maf_cutoff = 0.05, + target_categories = NULL) { + trait_names <- names(trait_single_prefixes) + if (is.null(trait_names)) + stop("sldsc_postprocessing_pipeline: trait_single_prefixes must be a named list.") + + message("[sldsc] Computing M_ref...") + M_ref <- compute_sldsc_M_ref(target_anno_dir = target_anno_dir, + frqfile_dir = frqfile_dir, + plink_name = plink_name, + maf_cutoff = maf_cutoff) + message(sprintf("[sldsc] M_ref = %d (MAF cutoff %g)", M_ref, maf_cutoff)) + + message("[sldsc] Computing per-annotation sd...") + sd_annot_full <- compute_sldsc_annot_sd(target_anno_dir = target_anno_dir, + frqfile_dir = frqfile_dir, + plink_name = plink_name, + maf_cutoff = maf_cutoff) + message(sprintf("[sldsc] sd computed for %d annotation columns", + length(sd_annot_full))) + + message("[sldsc] Detecting binary vs continuous annotations...") + is_binary_full <- is_binary_sldsc_annot(target_anno_dir = target_anno_dir) + + # Auto-detect target categories from a representative run. + if (is.null(target_categories)) { + pivot_run <- NULL + if (!is.null(trait_joint_prefix) && length(trait_joint_prefix) > 0) { + jp <- trait_joint_prefix[[1]] + if (is.character(jp) && length(jp) == 1L && !is.na(jp) && nzchar(jp)) { + pivot_run <- tryCatch(read_sldsc_trait(jp), error = function(e) NULL) + } + } + if (is.null(pivot_run) && + length(trait_single_prefixes) > 0L && + length(trait_single_prefixes[[1]]) > 0L) { + pivot_run <- tryCatch(read_sldsc_trait(trait_single_prefixes[[1]][1]), + error = function(e) NULL) + } + if (is.null(pivot_run)) + stop("sldsc_postprocessing_pipeline: cannot auto-detect target_categories.") + target_categories <- intersect(pivot_run$categories, names(sd_annot_full)) + message(sprintf("[sldsc] Auto-detected %d target categories", length(target_categories))) + } + + baseline_categories <- character(0) + if (!is.null(trait_joint_prefix) && length(trait_joint_prefix) > 0L) { + jp <- trait_joint_prefix[[1]] + if (is.character(jp) && length(jp) == 1L && !is.na(jp) && nzchar(jp)) { + pivot <- tryCatch(read_sldsc_trait(jp), error = function(e) NULL) + if (!is.null(pivot)) + baseline_categories <- setdiff(pivot$categories, target_categories) + } + } + if (length(baseline_categories) > 0L) { + msg_head <- paste(utils::head(baseline_categories, 5), collapse = ", ") + msg_tail <- if (length(baseline_categories) > 5) ", ..." else "" + message(sprintf("[sldsc] Detected %d baseline annotations: %s%s", + length(baseline_categories), msg_head, msg_tail)) + } else { + message("[sldsc] No baseline annotations detected (joint-run prefix missing or unreadable).") + } + + sd_annot <- sd_annot_full[target_categories] + is_binary <- if (length(is_binary_full) > 0L) is_binary_full[target_categories] else + stats::setNames(rep(FALSE, length(target_categories)), target_categories) + + message(sprintf("[sldsc] Standardizing %d traits...", length(trait_names))) + per_trait <- list() + + for (trait in trait_names) { + # ---- single-mode ---- + single_summaries <- list() + single_blocks <- list() + single_h2gs <- numeric(0) + sing_prefs <- trait_single_prefixes[[trait]] + for (i in seq_along(target_categories)) { + cat_name <- target_categories[i] + if (i > length(sing_prefs)) break + pref <- sing_prefs[i] + run <- tryCatch(read_sldsc_trait(pref), error = function(e) { + warning(sprintf("[sldsc] Failed to read single %s for %s: %s", + cat_name, trait, e$message)); NULL + }) + if (is.null(run)) next + std <- tryCatch( + standardize_sldsc_trait(run, sd_annot[cat_name], M_ref, + target_categories = cat_name, mode = "single"), + error = function(e) { + warning(sprintf("[sldsc] Failed to standardize single %s for %s: %s", + cat_name, trait, e$message)); NULL + }) + if (is.null(std)) next + single_summaries[[cat_name]] <- std$summary + single_blocks[[cat_name]] <- std$tau_star_blocks + single_h2gs <- c(single_h2gs, std$h2g) + } + single_df <- if (length(single_summaries) > 0L) + do.call(rbind, single_summaries) else NULL + if (!is.null(single_df)) rownames(single_df) <- NULL + blocks_single <- if (length(single_blocks) > 0L) do.call(cbind, single_blocks) else NULL + + # ---- joint-mode ---- + joint_df <- NULL + blocks_joint <- NULL + joint_h2g <- NA_real_ + n_blocks_trait <- NA_integer_ + if (!is.null(trait_joint_prefix) && trait %in% names(trait_joint_prefix)) { + jp <- trait_joint_prefix[[trait]] + if (is.character(jp) && length(jp) == 1L && !is.na(jp) && nzchar(jp)) { + run <- tryCatch(read_sldsc_trait(jp), error = function(e) { + warning(sprintf("[sldsc] Failed to read joint for %s: %s", + trait, e$message)); NULL + }) + if (!is.null(run)) { + std <- tryCatch( + standardize_sldsc_trait(run, sd_annot, M_ref, + target_categories = target_categories, + mode = "joint"), + error = function(e) { + warning(sprintf("[sldsc] Failed to standardize joint for %s: %s", + trait, e$message)); NULL + }) + if (!is.null(std)) { + joint_df <- std$summary + blocks_joint <- std$tau_star_blocks + joint_h2g <- std$h2g + n_blocks_trait <- std$n_blocks + } + } + } + } + + summary_wide <- .sldsc_assemble_trait_summary(single_df, joint_df, + target_categories, is_binary) + per_trait[[trait]] <- list( + summary = summary_wide, + tau_star_blocks_single = blocks_single, + tau_star_blocks_joint = blocks_joint, + h2g = if (!is.na(joint_h2g)) joint_h2g + else if (length(single_h2gs) > 0L) median(single_h2gs) + else NA_real_, + n_blocks = n_blocks_trait + ) + } + + message("[sldsc] Running random-effects meta across traits...") + pt_view_single <- .sldsc_view_for_meta(per_trait, "single") + pt_view_joint <- .sldsc_view_for_meta(per_trait, "joint") + + build_table <- function(quantity, view, label) { + rows <- list() + for (cat in target_categories) { + m <- meta_sldsc_random(view, cat, quantity) + rows[[cat]] <- data.frame( + target = cat, + is_binary = unname(is_binary[cat]), + mean = m$mean, + se = m$se, + p = m$p, + n_traits = m$n_traits, + stringsAsFactors = FALSE + ) + } + df <- do.call(rbind, rows) + rownames(df) <- NULL + nm_old <- c("mean", "se", "p") + nm_new <- paste0(label, "_", nm_old) + names(df)[names(df) %in% nm_old] <- nm_new + df + } + + meta_tau_star_single <- build_table("tau_star", pt_view_single, "single") + meta_tau_star_joint <- build_table("tau_star", pt_view_joint, "joint") + meta_E_single <- build_table("enrichment", pt_view_single, "single") + meta_ES_single <- build_table("enrichstat", pt_view_single, "single") + + # Combine tau_star single + joint into one wide frame. + meta_tau_star <- meta_tau_star_single + ord <- match(meta_tau_star$target, meta_tau_star_joint$target) + meta_tau_star$joint_mean <- meta_tau_star_joint$joint_mean[ord] + meta_tau_star$joint_se <- meta_tau_star_joint$joint_se[ord] + meta_tau_star$joint_p <- meta_tau_star_joint$joint_p[ord] + + # Two-channel enrichment meta: effect/SE from E meta, p from EnrichStat meta. + meta_enrichment <- meta_E_single + meta_enrichment$single_p <- meta_ES_single$single_p[match(meta_enrichment$target, + meta_ES_single$target)] + + # Pure EnrichStat meta (separate frame). + meta_enrichstat <- meta_ES_single + + list( + per_trait = per_trait, + meta = list( + tau_star = meta_tau_star, + enrichment = meta_enrichment, + enrichstat = meta_enrichstat + ), + params = list( + maf_cutoff = maf_cutoff, + M_ref = M_ref, + target_categories = target_categories, + n_baseline = length(baseline_categories), + baseline_categories = baseline_categories, + trait_names = trait_names + ) + ) +} diff --git a/tests/testthat/test_LD.R b/tests/testthat/test_LD.R index ea6a8d7b..203e7c36 100644 --- a/tests/testthat/test_LD.R +++ b/tests/testthat/test_LD.R @@ -230,7 +230,7 @@ test_that("partition_LD_matrix validates block structure properly", { } # Expect an error for invalid block structure - expect_error(partition_LD_matrix(invalid_ld_data), "Matrix does not have the expected block structure") + expect_error(partition_LD_matrix(invalid_ld_data), "Matrix lacks expected block structure") } file.remove(LD_meta_file_path) @@ -823,3 +823,705 @@ test_that("can_merge checks chromosome and size", { b3 <- data.frame(chrom = "2", size = 100) expect_false(pecotmr:::can_merge(b1, b3, max_size = 500)) }) + +# =========================================================================== +# check_ld (regularize_ld) +# =========================================================================== + +test_that("check_ld reports PD for identity matrix", { + R <- diag(5) + result <- check_ld(R) + expect_true(result$is_pd) + expect_true(result$is_psd) + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) + expect_equal(result$condition_number, 1) +}) + +test_that("check_ld reports PD for well-conditioned correlation matrix", { + R <- matrix(0.3, 4, 4) + diag(R) <- 1 + result <- check_ld(R) + expect_true(result$is_pd) + expect_true(result$is_psd) + expect_equal(result$n_negative, 0) + expect_equal(result$method_applied, "none") +}) + +test_that("check_ld detects non-PSD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + result <- check_ld(R) + expect_false(result$is_psd) + expect_true(result$n_negative > 0) + expect_true(result$min_eigenvalue < 0) + expect_equal(result$method_applied, "none") +}) + +test_that("check_ld shrink method modifies non-PD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + result <- check_ld(R, method = "shrink") + expect_equal(result$method_applied, "shrink") + expect_false(identical(result$R, R)) + # With strong enough shrinkage, result should be PD + result2 <- check_ld(R, method = "shrink", shrinkage = 0.5) + eig <- eigen(result2$R, symmetric = TRUE) + expect_true(all(eig$values > 0)) +}) + +test_that("check_ld eigenfix method improves non-PD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + original_min_eval <- min(eigen(R, symmetric = TRUE)$values) + result <- check_ld(R, method = "eigenfix") + expect_equal(result$method_applied, "eigenfix") + # Eigenfix should improve the minimum eigenvalue + fixed_min_eval <- min(eigen(result$R, symmetric = TRUE)$values) + expect_true(fixed_min_eval > original_min_eval) + # Unit diagonal preserved + expect_equal(diag(result$R), rep(1, 3)) + # Symmetry preserved + expect_equal(result$R, t(result$R)) +}) + +test_that("check_ld shrink does nothing when matrix is already PD", { + R <- diag(3) + result <- check_ld(R, method = "shrink") + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) +}) + +test_that("check_ld eigenfix does nothing when matrix is already PD", { + R <- diag(3) + result <- check_ld(R, method = "eigenfix") + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) +}) + +# =========================================================================== +# extract_block_matrices: out-of-range blocks +# =========================================================================== + +test_that("extract_block_matrices warns and skips out-of-range blocks", { + mat <- diag(4) + vnames <- paste0("v", 1:4) + rownames(mat) <- colnames(mat) <- vnames + block_metadata <- data.frame( + block_id = c(1, 2), + start_idx = c(1, 10), + end_idx = c(2, 12), + chrom = c("1", "1"), + block_start = c(100, 500), + block_end = c(200, 600), + size = c(2, 3), + stringsAsFactors = FALSE + ) + expect_warning( + result <- pecotmr:::extract_block_matrices(mat, block_metadata, vnames), + "outside the range" + ) + valid_blocks <- result$ld_matrices[!sapply(result$ld_matrices, is.null)] + expect_equal(length(valid_blocks), 1) + expect_equal(nrow(valid_blocks[[1]]), 2) +}) + +# =========================================================================== +# resolve_ld_source: type detection with real fixtures +# =========================================================================== + +geno_test_data_dir <- test_path("test_data") +geno_region_all <- "chr21:17513228-17592874" + +test_that("resolve_ld_source detects PLINK2 from metadata", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_p2_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "plink2") + expect_equal(result$meta_path, meta_file) +}) + +test_that("resolve_ld_source detects VCF from metadata", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_vcf_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants.vcf.gz", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "vcf") +}) + +test_that("resolve_ld_source detects GDS from metadata", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_gds_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants.gds", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "gds") +}) + +test_that("resolve_ld_source detects precomputed from metadata", { + # Existing LD block metadata with non-zero start/end + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_pre_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("chr1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), "\n", file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "precomputed") +}) + +test_that("resolve_ld_source errors on missing file", { + expect_error(pecotmr:::resolve_ld_source("/nonexistent/file.tsv"), "not found") +}) + +test_that("resolve_ld_source errors on 0:0 sentinel with non-genotype path", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_bad_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "nonexistent_prefix", sep = "\t"), "\n", + file = meta_file, append = TRUE) + expect_error(pecotmr:::resolve_ld_source(meta_file), "0:0 sentinel") +}) + +# =========================================================================== +# resolve_genotype_path_for_region +# =========================================================================== + +test_that("resolve_genotype_path_for_region resolves correct chromosome path", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_path_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_genotype_path_for_region(meta_file, geno_region_all) + expect_equal(result, file.path(geno_test_data_dir, "test_variants")) +}) + +test_that("resolve_genotype_path_for_region errors on missing chromosome", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_nochr_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("1", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + expect_error( + pecotmr:::resolve_genotype_path_for_region(meta_file, geno_region_all), + "No entry for chromosome" + ) +}) + +# =========================================================================== +# load_LD_from_genotype with real fixtures +# =========================================================================== + +test_that("load_LD_from_genotype returns LD matrix with .afreq", { + skip_if_not_installed("pgenlibr") + plink_prefix <- file.path(geno_test_data_dir, "test_variants") + result <- pecotmr:::load_LD_from_genotype(plink_prefix, geno_region_all) + expect_true(is.list(result)) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) + expect_true(isSymmetric(result$LD_matrix)) + expect_false(result$is_genotype) + # ref_panel should have allele_freq from .afreq file + expect_true("allele_freq" %in% names(result$ref_panel)) + expect_true(all(result$ref_panel$allele_freq > 0)) + expect_true(all(result$ref_panel$allele_freq < 1)) + # block_metadata + expect_true(is.data.frame(result$block_metadata)) + expect_equal(nrow(result$block_metadata), 1L) +}) + +test_that("load_LD_from_genotype returns genotype matrix when requested", { + skip_if_not_installed("pgenlibr") + plink_prefix <- file.path(geno_test_data_dir, "test_variants") + result <- pecotmr:::load_LD_from_genotype(plink_prefix, geno_region_all, + return_genotype = TRUE) + expect_true(result$is_genotype) + expect_equal(nrow(result$LD_matrix), 100L) # samples + expect_equal(ncol(result$LD_matrix), 349L) # variants +}) + +test_that("load_LD_from_genotype computes variance with n_sample", { + skip_if_not_installed("pgenlibr") + plink_prefix <- file.path(geno_test_data_dir, "test_variants") + result <- pecotmr:::load_LD_from_genotype(plink_prefix, geno_region_all, + n_sample = 100L) + expect_true("variance" %in% names(result$ref_panel)) + expect_true("n_nomiss" %in% names(result$ref_panel)) + expect_equal(result$ref_panel$n_nomiss[1], 100L) + expect_true(all(result$ref_panel$variance > 0)) +}) + +test_that("load_LD_from_genotype falls back to computed AF without .afreq", { + skip_if_not_installed("VariantAnnotation") + vcf_path <- file.path(geno_test_data_dir, "test_variants.vcf.gz") + result <- suppressWarnings( + pecotmr:::load_LD_from_genotype(vcf_path, geno_region_all) + ) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) + expect_true(isSymmetric(result$LD_matrix)) + # Allele frequencies computed from genotypes + expect_true("allele_freq" %in% names(result$ref_panel)) + expect_true(all(result$ref_panel$allele_freq > 0)) + expect_true(all(result$ref_panel$allele_freq < 1)) +}) + +test_that("load_LD_from_genotype works with GDS files", { + skip_if_not_installed("SNPRelate") + skip_if_not_installed("gdsfmt") + gds_path <- file.path(geno_test_data_dir, "test_variants.gds") + result <- pecotmr:::load_LD_from_genotype(gds_path, geno_region_all) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) + expect_true(isSymmetric(result$LD_matrix)) +}) + +test_that("load_LD_from_genotype .afreq and computed AF are consistent", { + skip_if_not_installed("pgenlibr") + skip_if_not_installed("SNPRelate") + skip_if_not_installed("gdsfmt") + plink_prefix <- file.path(geno_test_data_dir, "test_variants") + gds_path <- file.path(geno_test_data_dir, "test_variants.gds") + res_afreq <- pecotmr:::load_LD_from_genotype(plink_prefix, geno_region_all) + res_computed <- pecotmr:::load_LD_from_genotype(gds_path, geno_region_all) + # Allele frequencies should be close (same data, different source) + expect_true(max(abs(res_afreq$ref_panel$allele_freq - res_computed$ref_panel$allele_freq)) < 0.01) +}) + +# =========================================================================== +# load_LD_matrix with real genotype fixtures via metadata +# =========================================================================== + +test_that("load_LD_matrix dispatches to PLINK2 genotype source", { + skip_if_not_installed("pgenlibr") + meta_file <- file.path(geno_test_data_dir, "ld_meta_ldmat_p2_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, geno_region_all) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) + expect_false(result$is_genotype) +}) + +test_that("load_LD_matrix dispatches to VCF genotype source", { + skip_if_not_installed("VariantAnnotation") + meta_file <- file.path(geno_test_data_dir, "ld_meta_ldmat_vcf_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants.vcf.gz", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- suppressWarnings(load_LD_matrix(meta_file, geno_region_all)) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) +}) + +test_that("load_LD_matrix dispatches to GDS genotype source", { + skip_if_not_installed("SNPRelate") + skip_if_not_installed("gdsfmt") + meta_file <- file.path(geno_test_data_dir, "ld_meta_ldmat_gds_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants.gds", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, geno_region_all) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 349L) +}) + +test_that("load_LD_matrix return_genotype='auto' returns X for genotype source", { + skip_if_not_installed("pgenlibr") + meta_file <- file.path(geno_test_data_dir, "ld_meta_ldmat_auto_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("21", "0", "0", "test_variants", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, geno_region_all, return_genotype = "auto") + expect_true(result$is_genotype) + expect_equal(nrow(result$LD_matrix), 100L) # samples + expect_equal(ncol(result$LD_matrix), 349L) # variants +}) + +test_that("load_LD_matrix return_genotype=TRUE errors for precomputed", { + meta_file <- gsub("//", "/", tempfile(pattern = "ld_meta_file", tmpdir = tempdir(), fileext = ".tsv")) + on.exit(unlink(meta_file), add = TRUE) + meta_df <- data.frame( + chrom = "chr1", start = 1000, end = 1200, + path = paste0( + "./test_data/LD_block_1.chr1_1000_1200.float16.txt.xz,", + "./test_data/LD_block_1.chr1_1000_1200.float16.bim" + ) + ) + write_delim(meta_df, meta_file, delim = "\t") + region <- data.frame(chrom = "chr1", start = 1000, end = 1190) + expect_error( + load_LD_matrix(meta_file, region, return_genotype = TRUE), + "genotype files" + ) +}) + +# =========================================================================== +# resolve_ld_source: PLINK1 detection +# =========================================================================== + +test_that("resolve_ld_source detects PLINK1 from metadata", { + skip_if_not_installed("snpStats") + meta_file <- file.path(geno_test_data_dir, "ld_meta_resolve_p1_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("22", "0", "0", "protocol_example.genotype", sep = "\t"), "\n", + file = meta_file, append = TRUE) + result <- pecotmr:::resolve_ld_source(meta_file) + expect_equal(result$type, "plink1") +}) + +# =========================================================================== +# load_LD_matrix: precomputed blocks via real .cor.xz fixtures +# =========================================================================== + +test_that("load_LD_matrix loads single precomputed block", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_precomp_single_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), "\n", file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, "chr1:1000-1190") + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 5L) + expect_true(isSymmetric(result$LD_matrix)) + expect_equal(length(result$LD_variants), 5L) + expect_true(all(grepl("^chr1:", result$LD_variants))) + expect_false(result$is_genotype) + # block_metadata should have one block + expect_equal(nrow(result$block_metadata), 1L) + # ref_panel should have variant info + expect_true("chrom" %in% names(result$ref_panel)) + expect_true("pos" %in% names(result$ref_panel)) +}) + +test_that("load_LD_matrix loads multiple precomputed blocks", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_precomp_multi_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + lines <- c( + paste("chrom", "start", "end", "path", sep = "\t"), + paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), + paste("1", "1200", "1400", + "LD_block_2.chr1_1200_1400.float16.txt.xz,LD_block_2.chr1_1200_1400.float16.bim", + sep = "\t"), + paste("1", "1400", "1600", + "LD_block_3.chr1_1400_1600.float16.txt.xz,LD_block_3.chr1_1400_1600.float16.bim", + sep = "\t") + ) + writeLines(lines, meta_file) + result <- load_LD_matrix(meta_file, "chr1:1000-1500") + expect_true(is.matrix(result$LD_matrix)) + # Should span blocks 1-3: 5 + 5 + 5 = 15 unique variants (no overlap in variant IDs) + expect_true(nrow(result$LD_matrix) >= 10) + expect_true(isSymmetric(result$LD_matrix)) + expect_true(nrow(result$block_metadata) >= 2) +}) + +test_that("load_LD_matrix with n_sample for precomputed blocks with freq data", { + # The 9-column bim format includes allele_freq, variance, n_nomiss; + # the 6-column bim does not. With 6-col bim and no allele_freq, + # n_sample cannot compute variance — ref_panel has base columns only. + meta_file <- file.path(geno_test_data_dir, "ld_meta_precomp_nsamp_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), "\n", file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, "chr1:1000-1190", n_sample = 500L) + # ref_panel should always have basic variant info + expect_true(all(c("chrom", "pos", "A2", "A1", "variant_id") %in% names(result$ref_panel))) + # 6-col bim lacks allele_freq so variance computation is skipped + expect_false("variance" %in% names(result$ref_panel)) +}) + +# =========================================================================== +# process_LD_matrix: real .cor.xz fixtures +# =========================================================================== + +test_that("process_LD_matrix reads .cor.xz with explicit bim path", { + ld_file <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.txt.xz") + bim_file <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.bim") + result <- pecotmr:::process_LD_matrix(ld_file, bim_file) + expect_true(is.list(result)) + expect_true(is.matrix(result$LD_matrix)) + expect_equal(nrow(result$LD_matrix), 5L) + expect_equal(ncol(result$LD_matrix), 5L) + expect_true(isSymmetric(result$LD_matrix)) + # Diagonal should be 1 + expect_true(all(abs(diag(result$LD_matrix) - 1) < 1e-4)) + # Variant names should be chr:pos:A2:A1 format + expect_true(all(grepl("^chr1:", rownames(result$LD_matrix)))) + # LD_variants data frame + expect_true(is.data.frame(result$LD_variants)) + expect_true("variants" %in% names(result$LD_variants)) + expect_equal(nrow(result$LD_variants), 5L) +}) + +test_that("process_LD_matrix reads different blocks consistently", { + bim1 <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.bim") + bim2 <- file.path(geno_test_data_dir, "LD_block_2.chr1_1200_1400.float16.bim") + ld1 <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.txt.xz") + ld2 <- file.path(geno_test_data_dir, "LD_block_2.chr1_1200_1400.float16.txt.xz") + r1 <- pecotmr:::process_LD_matrix(ld1, bim1) + r2 <- pecotmr:::process_LD_matrix(ld2, bim2) + # Different blocks should have different variant positions + expect_false(any(rownames(r1$LD_matrix) %in% rownames(r2$LD_matrix))) +}) + +test_that("process_LD_matrix reads 9-column bim with allele_freq/variance/n_nomiss", { + ld_file <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.txt.xz") + bim_file <- file.path(geno_test_data_dir, "LD_block_1.chr1_1000_1200.float16.9col.bim") + result <- pecotmr:::process_LD_matrix(ld_file, bim_file) + expect_equal(nrow(result$LD_matrix), 5L) + expect_true(isSymmetric(result$LD_matrix)) + # 9-column bim should include extra columns + expect_true("allele_freq" %in% names(result$LD_variants)) + expect_true("variance" %in% names(result$LD_variants)) + expect_true("n_nomiss" %in% names(result$LD_variants)) + expect_equal(result$LD_variants$allele_freq, c(0.3, 0.4, 0.2, 0.5, 0.15)) + expect_equal(result$LD_variants$n_nomiss, rep(500, 5)) +}) + +test_that("load_LD_matrix propagates allele_freq/variance/n_nomiss from 9-col bim", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_precomp_9col_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + writeLines(paste("chrom", "start", "end", "path", sep = "\t"), meta_file) + cat(paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.9col.bim", + sep = "\t"), "\n", file = meta_file, append = TRUE) + result <- load_LD_matrix(meta_file, "chr1:1000-1190") + # ref_panel should carry the extra columns from the 9-col bim + expect_true("allele_freq" %in% names(result$ref_panel)) + expect_true("variance" %in% names(result$ref_panel)) + expect_true("n_nomiss" %in% names(result$ref_panel)) + expect_equal(result$ref_panel$allele_freq, c(0.3, 0.4, 0.2, 0.5, 0.15)) + expect_equal(result$ref_panel$n_nomiss, rep(500, 5)) +}) + +# =========================================================================== +# get_regional_ld_meta: real .cor.xz fixtures +# =========================================================================== + +test_that("get_regional_ld_meta returns correct file paths for single block", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_regional_single_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + lines <- c( + paste("chrom", "start", "end", "path", sep = "\t"), + paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), + paste("1", "1200", "1400", + "LD_block_2.chr1_1200_1400.float16.txt.xz,LD_block_2.chr1_1200_1400.float16.bim", + sep = "\t") + ) + writeLines(lines, meta_file) + result <- pecotmr:::get_regional_ld_meta(meta_file, "chr1:1050-1150") + expect_true(is.list(result)) + expect_true(length(result$intersections$LD_file_paths) >= 1) + # All returned paths should exist + expect_true(all(file.exists(result$intersections$LD_file_paths))) + expect_true(all(file.exists(result$intersections$bim_file_paths))) +}) + +test_that("get_regional_ld_meta spans multiple blocks for wide region", { + meta_file <- file.path(geno_test_data_dir, "ld_meta_regional_multi_tmp.tsv") + on.exit(unlink(meta_file), add = TRUE) + lines <- c( + paste("chrom", "start", "end", "path", sep = "\t"), + paste("1", "1000", "1200", + "LD_block_1.chr1_1000_1200.float16.txt.xz,LD_block_1.chr1_1000_1200.float16.bim", + sep = "\t"), + paste("1", "1200", "1400", + "LD_block_2.chr1_1200_1400.float16.txt.xz,LD_block_2.chr1_1200_1400.float16.bim", + sep = "\t"), + paste("1", "1400", "1600", + "LD_block_3.chr1_1400_1600.float16.txt.xz,LD_block_3.chr1_1400_1600.float16.bim", + sep = "\t") + ) + writeLines(lines, meta_file) + result <- pecotmr:::get_regional_ld_meta(meta_file, "chr1:1000-1500") + expect_true(length(result$intersections$LD_file_paths) >= 2) +}) + +# =========================================================================== +# drop_collinear_columns: strategy variants +# =========================================================================== + +test_that("drop_collinear_columns variance strategy removes lowest-variance column", { + set.seed(42) + X <- matrix(rnorm(100 * 4), 100, 4) + colnames(X) <- c("a", "b", "c", "d") + # Make column "c" have near-zero variance + X[, "c"] <- X[1, "c"] + result <- pecotmr:::drop_collinear_columns( + X, c("b", "c", "d"), strategy = "variance" + ) + expect_false("c" %in% colnames(result)) + expect_equal(ncol(result), 3L) +}) + +test_that("drop_collinear_columns response_correlation strategy works", { + set.seed(42) + X <- matrix(rnorm(100 * 3), 100, 3) + colnames(X) <- c("a", "b", "c") + y <- X[, "a"] + rnorm(100, sd = 0.1) # y correlates strongly with "a" + result <- pecotmr:::drop_collinear_columns( + X, c("a", "b", "c"), strategy = "response_correlation", response = y + ) + # Should keep "a" (highest |cor| with response) and remove one of b/c + expect_true("a" %in% colnames(result)) + expect_equal(ncol(result), 2L) +}) + +test_that("drop_collinear_columns response_correlation errors without response", { + X <- matrix(1:12, 4, 3) + colnames(X) <- c("a", "b", "c") + expect_error( + pecotmr:::drop_collinear_columns(X, c("a", "b"), strategy = "response_correlation"), + "response must be supplied" + ) +}) + +test_that("drop_collinear_columns with single problematic column removes it", { + X <- matrix(rnorm(40), 10, 4) + colnames(X) <- c("a", "b", "c", "d") + result <- pecotmr:::drop_collinear_columns(X, "b", strategy = "correlation") + expect_false("b" %in% colnames(result)) + expect_equal(ncol(result), 3L) +}) + +# =========================================================================== +# enforce_design_full_rank: additional strategies and fallback paths +# =========================================================================== + +test_that("enforce_design_full_rank variance strategy produces full rank", { + set.seed(42) + X <- matrix(rnorm(100 * 4), 100, 4) + X[, 4] <- X[, 1] + X[, 2] # rank deficient + colnames(X) <- c("a", "b", "c", "d") + C <- matrix(rnorm(100), 100, 1) + result <- enforce_design_full_rank(X, C, strategy = "variance") + full_design <- cbind(1, result, C) + expect_equal(qr(full_design)$rank, ncol(full_design)) + expect_true(ncol(result) < ncol(X)) +}) + +test_that("enforce_design_full_rank response_correlation strategy works", { + set.seed(42) + X <- matrix(rnorm(100 * 4), 100, 4) + X[, 4] <- X[, 1] + X[, 2] + colnames(X) <- c("a", "b", "c", "d") + C <- matrix(rnorm(100), 100, 1) + y <- X[, "a"] + rnorm(100, sd = 0.1) + result <- enforce_design_full_rank(X, C, strategy = "response_correlation", response = y) + full_design <- cbind(1, result, C) + expect_equal(qr(full_design)$rank, ncol(full_design)) +}) + +test_that("enforce_design_full_rank returns unchanged X when already full rank", { + set.seed(42) + X <- matrix(rnorm(100 * 3), 100, 3) + colnames(X) <- c("a", "b", "c") + C <- matrix(rnorm(100), 100, 1) + result <- enforce_design_full_rank(X, C, strategy = "correlation") + expect_equal(ncol(result), ncol(X)) +}) + +test_that("enforce_design_full_rank fallback to correlation pruning works", { + set.seed(42) + n <- 50 + p <- 10 + X <- matrix(rnorm(n * 3), n, 3) + # Create highly collinear columns that are hard for iterative removal + X <- cbind(X, X[, 1] + rnorm(n, sd = 1e-10), + X[, 2] + rnorm(n, sd = 1e-10), + X[, 3] + rnorm(n, sd = 1e-10), + X[, 1] + X[, 2] + rnorm(n, sd = 1e-10)) + colnames(X) <- paste0("v", seq_len(ncol(X))) + C <- matrix(rnorm(n), n, 1) + result <- enforce_design_full_rank(X, C, strategy = "correlation", + max_iterations = 2L) + full_design <- cbind(1, result, C) + expect_equal(qr(full_design)$rank, ncol(full_design)) +}) + +# =========================================================================== +# ld_clump_by_score: edge cases +# =========================================================================== + +test_that("ld_clump_by_score errors on empty matrix", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + X <- matrix(numeric(0), nrow = 10, ncol = 0) + expect_error(ld_clump_by_score(X, score = numeric(0), chr = integer(0), pos = integer(0)), + "at least one column") +}) + +test_that("ld_clump_by_score returns 1L for single variant", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + X <- matrix(c(0, 1, 2, 1, 0), ncol = 1) + result <- ld_clump_by_score(X, score = 1.0, chr = 1L, pos = 100L) + expect_equal(result, 1L) +}) + +test_that("ld_clump_by_score errors on mismatched score length", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + X <- matrix(rnorm(20), 5, 4) + expect_error(ld_clump_by_score(X, score = c(1, 2), chr = rep(1L, 4), pos = 1:4), + "length\\(score\\)") +}) + +test_that("ld_clump_by_score errors on mismatched chr/pos length", { + skip_if_not_installed("bigsnpr") + skip_if_not_installed("bigstatsr") + X <- matrix(rnorm(20), 5, 4) + expect_error(ld_clump_by_score(X, score = runif(4), chr = rep(1L, 2), pos = 1:4), + "chr and pos") +}) + +# =========================================================================== +# ld_prune_by_correlation: verbose paths +# =========================================================================== + +test_that("ld_prune_by_correlation verbose reports pruning", { + # Create matrix with correlated columns + set.seed(42) + base <- rnorm(100) + X <- cbind(base, base + rnorm(100, sd = 0.1), rnorm(100), rnorm(100), rnorm(100)) + colnames(X) <- paste0("v", 1:5) + expect_message( + ld_prune_by_correlation(X, cor_thres = 0.5, verbose = TRUE), + "pruned" + ) +}) + +test_that("ld_prune_by_correlation verbose reports no pruning", { + # Create a small matrix with no correlated columns + set.seed(42) + X <- matrix(rnorm(500), 100, 5) + colnames(X) <- paste0("v", 1:5) + expect_message( + ld_prune_by_correlation(X, cor_thres = 0.999, verbose = TRUE), + "no columns pruned" + ) +})