Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/environment/pixi.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[project]
[workspace]
name = "r-pecotmr"
channels = ["dnachun", "conda-forge", "bioconda"]
platforms = ["linux-64", "osx-arm64"]
Expand Down
8 changes: 6 additions & 2 deletions .github/recipe/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,19 @@ requirements:
- bioconductor-iranges
- bioconductor-qvalue
- bioconductor-s4vectors
- bioconductor-snprelate
- bioconductor-snpstats
- bioconductor-mungesumstats
- bioconductor-rsamtools
- r-base
- r-bglr
- r-bigsnpr
- r-coda
- r-coloc
- r-colocboost
- r-cpp11
- r-cpp11armadillo
- r-decor
- r-dofuture
- r-dplyr
- r-flashier
Expand All @@ -54,8 +60,6 @@ requirements:
- r-pgenlibr
- r-purrr
- r-qgg
- r-rcpp
- r-rcpparmadillo
- r-rcppdpr
- r-readr
- r-rfast
Expand Down
13 changes: 9 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ Authors@R: c(person("Gao Wang",role = c("cre","aut"),
License: MIT + file LICENSE
Imports:
Biostrings,
GenomicRanges,
IRanges,
Rcpp,
MungeSumstats,
Rsamtools,
S4Vectors,
coloc,
doFuture,
Expand Down Expand Up @@ -58,15 +60,18 @@ Suggests:
qgg,
qvalue,
rmarkdown,
gdsfmt,
SNPRelate,
snpStats,
testthat
testthat,
VariantAnnotation
Remotes:
stephenslab/fsusieR,
stephenslab/mvsusieR,
stephenslab/susieR,
LinkingTo:
Rcpp,
RcppArmadillo
cpp11,
cpp11armadillo
NeedsCompilation: yes
VignetteBuilder: knitr
RoxygenNote: 7.3.3
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ export(load_tsv_region)
export(load_twas_weights)
export(mash_pipeline)
export(mash_rand_null_sample)
export(match_ref_panel)
export(mcp_weights)
export(merge_mash_data)
export(merge_sumstats_matrices)
Expand Down Expand Up @@ -97,6 +98,7 @@ export(scad_weights)
export(sdpr)
export(sdpr_weights)
export(slalom)
export(standardise_sumstats_columns)
export(summary_stats_qc)
export(susie_ash_weights)
export(susie_inf_weights)
Expand Down Expand Up @@ -211,4 +213,4 @@ importFrom(utils,read.table)
importFrom(utils,tail)
importFrom(vctrs,vec_duplicate_detect)
importFrom(vroom,vroom)
useDynLib(pecotmr)
useDynLib(pecotmr, .registration = TRUE)
106 changes: 61 additions & 45 deletions R/LD.R
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,9 @@ process_LD_matrix <- function(LD_file_path, snp_file_path = NULL) {
}

if (is_pvar) {
# PLINK2 .pvar format: read via existing read_pvar_text()
LD_variants <- read_pvar_text(snp_file_path)
# read_pvar_text returns: chrom, id, pos, A2 (REF), A1 (ALT)
# PLINK2 .pvar format: read via existing read_pvar()
LD_variants <- read_pvar(snp_file_path)
# read_pvar returns: chrom, id, pos, A2 (REF), A1 (ALT)
LD_variants <- LD_variants %>%
mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))),
variants = normalize_variant_id(id)) %>%
Expand Down Expand Up @@ -286,21 +286,21 @@ create_LD_matrix <- function(LD_matrices, variants) {
load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL,
return_genotype = FALSE, n_sample = NULL) {
source <- resolve_ld_source(LD_meta_file_path)
is_plink <- source$type %in% c("plink2", "plink1")
is_geno <- source$type %in% c("plink2", "plink1", "vcf", "gds")

# "auto": return X for PLINK, R for pre-computed
if (identical(return_genotype, "auto")) return_genotype <- is_plink
# "auto": return X for genotype sources, R for pre-computed
if (identical(return_genotype, "auto")) return_genotype <- is_geno

if (is_plink) {
prefix <- resolve_plink_prefix_for_region(source$meta_path, region, source$type)
return(load_LD_from_genotype(prefix, region, source$type,
if (is_geno) {
geno_path <- resolve_genotype_path_for_region(source$meta_path, region)
return(load_LD_from_genotype(geno_path, region,
return_genotype = return_genotype,
n_sample = n_sample))
}

# Pre-computed LD blocks (.cor.xz)
if (return_genotype) {
stop("return_genotype=TRUE requires PLINK genotype files, not pre-computed LD matrices.")
stop("return_genotype=TRUE requires genotype files, not pre-computed LD matrices.")
}
load_LD_from_blocks(source$meta_path, region, extract_coordinates, n_sample = n_sample)
}
Expand All @@ -321,26 +321,43 @@ has_plink1_files <- function(prefix) {
file.exists(paste0(prefix, ".fam"))
}

#' Does `path` name an existing VCF/BCF file?
#' @noRd
is_vcf_path <- function(path) {
  # Recognize .vcf, .vcf.gz, and .bcf extensions, then confirm the file exists.
  has_vcf_ext <- grepl("\\.(vcf|vcf\\.gz|bcf)$", path)
  has_vcf_ext && file.exists(path)
}

#' Does `path` name an existing GDS file?
#' @noRd
is_gds_path <- function(path) {
  # Recognize the .gds extension, then confirm the file exists on disk.
  has_gds_ext <- grepl("\\.gds$", path)
  has_gds_ext && file.exists(path)
}

#' Check whether a path points to a genotype source (PLINK, VCF, or GDS).
#'
#' Delegates to the per-format detectors; TRUE as soon as one matches.
#' @noRd
is_genotype_source <- function(path) {
  if (has_plink2_files(path)) return(TRUE)
  if (has_plink1_files(path)) return(TRUE)
  if (is_vcf_path(path)) return(TRUE)
  is_gds_path(path)
}

#' Resolve an LD source metadata TSV to its actual data type.
#'
#' The metadata TSV has columns: chrom, start, end, path. Two formats are supported:
#' The metadata TSV has columns: chrom, start, end, path. Three categories are
#' supported:
#' \itemize{
#' \item Pre-computed LD blocks (.cor.xz): many rows per chromosome, each with
#' specific start/end block boundaries and path pointing to .cor.xz files.
#' \item PLINK genotype files: one row per chromosome with start=0, end=0,
#' and path pointing to a per-chromosome PLINK prefix. The actual region
#' filter is applied by the PLINK loader, not by block boundaries.
#' \item Genotype files (PLINK2, PLINK1, VCF, or GDS): one row per chromosome
#' with start=0, end=0, and path pointing to a per-chromosome genotype file
#' or prefix. The actual region filter is applied by the genotype loader.
#' }
#'
#' This function peeks at the first row to determine the data type.
#' The actual per-chromosome PLINK prefix is resolved later by
#' \code{resolve_plink_prefix_for_region()} at load time.
#' The actual per-chromosome path is resolved later by
#' \code{resolve_genotype_path_for_region()} at load time.
#'
#' @param path Path to a metadata TSV file with columns chrom, start, end, path.
#' @return A list with:
#' \item{type}{"plink2", "plink1", or "precomputed"}
#' \item{data_path}{PLINK prefix from first row (for type detection only; actual
#' per-chromosome prefix is resolved at load time)}
#' \item{type}{"plink2", "plink1", "vcf", "gds", or "precomputed"}
#' \item{data_path}{Genotype path from first row (for type detection only; actual
#' per-chromosome path is resolved at load time)}
#' \item{meta_path}{The metadata TSV path (always set)}
#' @importFrom vroom vroom
#' @noRd
Expand All @@ -359,22 +376,24 @@ resolve_ld_source <- function(path) {

if (has_plink2_files(resolved)) return(list(type = "plink2", data_path = resolved, meta_path = path))
if (has_plink1_files(resolved)) return(list(type = "plink1", data_path = resolved, meta_path = path))
if (is_vcf_path(resolved)) return(list(type = "vcf", data_path = resolved, meta_path = path))
if (is_gds_path(resolved)) return(list(type = "gds", data_path = resolved, meta_path = path))

# Pre-computed .cor.xz blocks — verify not using 0:0 sentinel
if (!is.na(meta$start) && !is.na(meta$end) && meta$start == 0 && meta$end == 0) {
stop("Metadata has start=0, end=0 but path does not resolve to PLINK files: ", resolved,
"\n The 0:0 sentinel is only valid for whole-chromosome PLINK genotype files.")
stop("Metadata has start=0, end=0 but path does not resolve to genotype files: ", resolved,
"\n The 0:0 sentinel is only valid for whole-chromosome genotype files.")
}

list(type = "precomputed", meta_path = path)
}

#' Resolve the correct PLINK prefix for a given region from a metadata TSV.
#' Resolve the correct genotype path for a given region from a metadata TSV.
#' Reads the TSV, finds the row matching the query region's chromosome,
#' and returns the resolved PLINK prefix path.
#' and returns the resolved genotype file path or prefix.
#' @importFrom vroom vroom
#' @noRd
resolve_plink_prefix_for_region <- function(meta_path, region, source_type) {
resolve_genotype_path_for_region <- function(meta_path, region) {
parsed <- parse_region(region)
meta <- as.data.frame(vroom(meta_path, show_col_types = FALSE))
colnames(meta) <- c("chrom", "start", "end", "path")
Expand All @@ -391,17 +410,13 @@ resolve_plink_prefix_for_region <- function(meta_path, region, source_type) {

# ---------- Internal: load LD from genotype files ----------

#' Load genotype data from PLINK files and compute LD or return genotype matrix.
#' @param source_type Character, "plink1" or "plink2" (from resolve_ld_source).
#' Load genotype data and compute LD or return genotype matrix.
#' @noRd
load_LD_from_genotype <- function(prefix, region, source_type,
load_LD_from_genotype <- function(genotype_path, region,
return_genotype = FALSE, n_sample = NULL) {
# Load genotype matrix and variant info
result <- if (source_type == "plink2") {
load_plink2_data(prefix, region = region)
} else {
load_plink1_data(prefix, region = region)
}
# Load genotype matrix and variant info via the unified loader
result <- load_genotype_region(genotype_path, region = region,
return_variant_info = TRUE)
X <- result$X
variant_info <- result$variant_info

Expand All @@ -415,19 +430,20 @@ load_LD_from_genotype <- function(prefix, region, source_type,
ref_panel <- parse_variant_id(variant_ids)
ref_panel$variant_id <- variant_ids

# Load allele frequency from .afreq file (required for PLINK sources)
afreq <- read_afreq(prefix)
if (is.null(afreq)) {
stop("Allele frequency file (.afreq or .afreq.zst) not found at prefix: ", prefix,
"\n The .afreq file is required for PLINK genotype LD sources.")
}
freq_match <- match(variant_info$id, afreq$id)
n_unmatched <- sum(is.na(freq_match))
if (n_unmatched > 0) {
warning(n_unmatched, " out of ", length(freq_match),
" variants have no allele frequency in .afreq file.")
# Load allele frequency from .afreq file if available, otherwise compute from genotypes
afreq <- read_afreq(genotype_path)
if (!is.null(afreq)) {
freq_match <- match(variant_info$id, afreq$id)
n_unmatched <- sum(is.na(freq_match))
if (n_unmatched > 0) {
warning(n_unmatched, " out of ", length(freq_match),
" variants have no allele frequency in .afreq file.")
}
ref_panel$allele_freq <- afreq$alt_freq[freq_match]
} else {
# Compute ALT allele frequency directly from the dosage matrix
ref_panel$allele_freq <- colMeans(X, na.rm = TRUE) / 2
}
ref_panel$allele_freq <- afreq$alt_freq[freq_match]

# Compute variance if sample size provided
if (!is.null(n_sample)) {
Expand Down
23 changes: 0 additions & 23 deletions R/RcppExports.R

This file was deleted.

11 changes: 8 additions & 3 deletions R/allele_qc.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' Match alleles between target data and reference variants
#' Match target data alleles against a reference panel
#'
#' Match by ("chrom", "A1", "A2" and "pos"), accounting for possible
#' strand flips and major/minor allele flips (opposite effects and zscores).
#' Flips specified columns when alleles are swapped relative to the reference.
#'
#' @param target_data A data frame with columns "chrom", "pos", "A2", "A1" (and optionally other columns like "beta" or "z"),
#' or a vector of strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1". Can be automatically converted to a data frame if a vector.
Expand All @@ -22,7 +23,7 @@
#' @importFrom vctrs vec_duplicate_detect
#' @importFrom tidyr separate
#' @export
allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
match_ref_panel <- function(target_data, ref_variants, col_to_flip = NULL,
match_min_prop = 0.2, remove_dups = TRUE,
remove_indels = FALSE, remove_strand_ambiguous = TRUE,
flip_strand = FALSE, remove_unmatched = TRUE, ...) {
Expand Down Expand Up @@ -183,6 +184,10 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
return(list(target_data_qced = result, qc_summary = match_result))
}

#' @rdname match_ref_panel
#' @export
allele_qc <- match_ref_panel

#' Align Variant Names
#'
#' This function aligns variant names from two strings containing variant names in the format of
Expand Down Expand Up @@ -228,7 +233,7 @@ align_variant_names <- function(source, reference, remove_indels = FALSE, remove
source_df <- parse_variant_id(source)
reference_df <- parse_variant_id(reference)

qc_result <- allele_qc(
qc_result <- match_ref_panel(
target_data = source_df,
ref_variants = reference_df,
col_to_flip = NULL,
Expand Down
11 changes: 8 additions & 3 deletions R/compute_qtl_enrichment.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,14 @@
#' en <- compute_qtl_enrichment(gwas_fit, susie_fits, lambda = lambda, ImpN = ImpN, num_threads = num_threads)
#'
#' @seealso \code{\link[susieR]{susie}}
#' @useDynLib pecotmr
#' @useDynLib pecotmr, .registration = TRUE
#' @export
#'
compute_qtl_enrichment <- function(gwas_pip, susie_qtl_regions,
num_gwas = NULL, pi_qtl = NULL,
lambda = 1.0, ImpN = 25,
double_shrinkage = FALSE,
bessel_correction = TRUE,
num_threads = 1, verbose = TRUE) {
if (is.null(num_gwas)) {
warning("num_gwas is not provided. Estimating pi_gwas from the data. Note that this estimate may be biased if the input gwas_pip does not contain genome-wide variants.")
Expand Down Expand Up @@ -111,14 +113,17 @@ compute_qtl_enrichment <- function(gwas_pip, susie_qtl_regions,
x
})

# cpp11 requires exact integer types for int parameters
en <- qtl_enrichment_rcpp(
r_gwas_pip = gwas_pip,
r_qtl_susie_fit = susie_qtl_regions,
pi_gwas = pi_gwas,
pi_qtl = pi_qtl,
ImpN = ImpN,
ImpN = as.integer(ImpN),
shrinkage_lambda = lambda,
num_threads = num_threads
double_shrinkage = double_shrinkage,
bessel_correction = bessel_correction,
num_threads = as.integer(num_threads)
)

# Add the unmatched variants to the output
Expand Down
21 changes: 21 additions & 0 deletions R/cpp11.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by cpp11: do not edit by hand

dentist_iterative_impute <- function(LD_mat_r, nSample, zScore_r, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose) {
.Call(`_pecotmr_dentist_iterative_impute`, LD_mat_r, nSample, zScore_r, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose)
}

lassosum_rss_rcpp <- function(z_r, LD, lambda_r, thr, maxiter) {
.Call(`_pecotmr_lassosum_rss_rcpp`, z_r, LD, lambda_r, thr, maxiter)
}

prs_cs_rcpp <- function(a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) {
.Call(`_pecotmr_prs_cs_rcpp`, a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed)
}

qtl_enrichment_rcpp <- function(r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, double_shrinkage, bessel_correction, num_threads) {
.Call(`_pecotmr_qtl_enrichment_rcpp`, r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, double_shrinkage, bessel_correction, num_threads)
}

sdpr_rcpp <- function(bhat_r, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose, seed) {
.Call(`_pecotmr_sdpr_rcpp`, bhat_r, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose, seed)
}
Loading