Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/environment/pixi.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[project]
[workspace]
name = "r-pecotmr"
channels = ["dnachun", "conda-forge", "bioconda"]
platforms = ["linux-64", "osx-arm64"]
Expand Down
8 changes: 6 additions & 2 deletions .github/recipe/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,19 @@ requirements:
- bioconductor-iranges
- bioconductor-qvalue
- bioconductor-s4vectors
- bioconductor-snprelate
- bioconductor-snpstats
- bioconductor-mungesumstats
- bioconductor-rsamtools
- r-base
- r-bglr
- r-bigsnpr
- r-coda
- r-coloc
- r-colocboost
- r-cpp11
- r-cpp11armadillo
- r-decor
- r-dofuture
- r-dplyr
- r-flashier
Expand All @@ -54,8 +60,6 @@ requirements:
- r-pgenlibr
- r-purrr
- r-qgg
- r-rcpp
- r-rcpparmadillo
- r-rcppdpr
- r-readr
- r-rfast
Expand Down
13 changes: 9 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ Authors@R: c(person("Gao Wang",role = c("cre","aut"),
License: MIT + file LICENSE
Imports:
Biostrings,
GenomicRanges,
IRanges,
Rcpp,
MungeSumstats,
Rsamtools,
S4Vectors,
coloc,
doFuture,
Expand Down Expand Up @@ -58,15 +60,18 @@ Suggests:
qgg,
qvalue,
rmarkdown,
gdsfmt,
SNPRelate,
snpStats,
testthat
testthat,
VariantAnnotation
Remotes:
stephenslab/fsusieR,
stephenslab/mvsusieR,
stephenslab/susieR,
LinkingTo:
Rcpp,
RcppArmadillo
cpp11,
cpp11armadillo
NeedsCompilation: yes
VignetteBuilder: knitr
RoxygenNote: 7.3.3
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ export(load_tsv_region)
export(load_twas_weights)
export(mash_pipeline)
export(mash_rand_null_sample)
export(match_ref_panel)
export(mcp_weights)
export(merge_mash_data)
export(merge_sumstats_matrices)
Expand Down Expand Up @@ -97,6 +98,7 @@ export(scad_weights)
export(sdpr)
export(sdpr_weights)
export(slalom)
export(standardise_sumstats_columns)
export(summary_stats_qc)
export(susie_ash_weights)
export(susie_inf_weights)
Expand Down Expand Up @@ -211,4 +213,4 @@ importFrom(utils,read.table)
importFrom(utils,tail)
importFrom(vctrs,vec_duplicate_detect)
importFrom(vroom,vroom)
useDynLib(pecotmr)
useDynLib(pecotmr, .registration = TRUE)
106 changes: 61 additions & 45 deletions R/LD.R
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,9 @@ process_LD_matrix <- function(LD_file_path, snp_file_path = NULL) {
}

if (is_pvar) {
# PLINK2 .pvar format: read via existing read_pvar_text()
LD_variants <- read_pvar_text(snp_file_path)
# read_pvar_text returns: chrom, id, pos, A2 (REF), A1 (ALT)
# PLINK2 .pvar format: read via existing read_pvar()
LD_variants <- read_pvar(snp_file_path)
# read_pvar returns: chrom, id, pos, A2 (REF), A1 (ALT)
LD_variants <- LD_variants %>%
mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))),
variants = normalize_variant_id(id)) %>%
Expand Down Expand Up @@ -286,21 +286,21 @@ create_LD_matrix <- function(LD_matrices, variants) {
load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL,
return_genotype = FALSE, n_sample = NULL) {
source <- resolve_ld_source(LD_meta_file_path)
is_plink <- source$type %in% c("plink2", "plink1")
is_geno <- source$type %in% c("plink2", "plink1", "vcf", "gds")

# "auto": return X for PLINK, R for pre-computed
if (identical(return_genotype, "auto")) return_genotype <- is_plink
# "auto": return X for genotype sources, R for pre-computed
if (identical(return_genotype, "auto")) return_genotype <- is_geno

if (is_plink) {
prefix <- resolve_plink_prefix_for_region(source$meta_path, region, source$type)
return(load_LD_from_genotype(prefix, region, source$type,
if (is_geno) {
geno_path <- resolve_genotype_path_for_region(source$meta_path, region)
return(load_LD_from_genotype(geno_path, region,
return_genotype = return_genotype,
n_sample = n_sample))
}

# Pre-computed LD blocks (.cor.xz)
if (return_genotype) {
stop("return_genotype=TRUE requires PLINK genotype files, not pre-computed LD matrices.")
stop("return_genotype=TRUE requires genotype files, not pre-computed LD matrices.")
}
load_LD_from_blocks(source$meta_path, region, extract_coordinates, n_sample = n_sample)
}
Expand All @@ -321,26 +321,43 @@ has_plink1_files <- function(prefix) {
file.exists(paste0(prefix, ".fam"))
}

#' Does `path` name an existing VCF/BCF file?
#' @noRd
is_vcf_path <- function(path) {
  # Recognize .vcf, .vcf.gz, and .bcf extensions, then confirm the file exists.
  has_vcf_ext <- grepl("\\.(vcf|vcf\\.gz|bcf)$", path)
  has_vcf_ext && file.exists(path)
}

#' Does `path` name an existing GDS file?
#' @noRd
is_gds_path <- function(path) {
  # Recognize the .gds extension, then confirm the file exists on disk.
  has_gds_ext <- grepl("\\.gds$", path)
  has_gds_ext && file.exists(path)
}

#' Check whether a path points to a genotype source (PLINK, VCF, or GDS).
#'
#' Delegates to the per-format detectors; TRUE as soon as one matches.
#' @noRd
is_genotype_source <- function(path) {
  if (has_plink2_files(path)) return(TRUE)
  if (has_plink1_files(path)) return(TRUE)
  if (is_vcf_path(path)) return(TRUE)
  is_gds_path(path)
}

#' Resolve an LD source metadata TSV to its actual data type.
#'
#' The metadata TSV has columns: chrom, start, end, path. Two formats are supported:
#' The metadata TSV has columns: chrom, start, end, path. Three categories are
#' supported:
#' \itemize{
#' \item Pre-computed LD blocks (.cor.xz): many rows per chromosome, each with
#' specific start/end block boundaries and path pointing to .cor.xz files.
#' \item PLINK genotype files: one row per chromosome with start=0, end=0,
#' and path pointing to a per-chromosome PLINK prefix. The actual region
#' filter is applied by the PLINK loader, not by block boundaries.
#' \item Genotype files (PLINK2, PLINK1, VCF, or GDS): one row per chromosome
#' with start=0, end=0, and path pointing to a per-chromosome genotype file
#' or prefix. The actual region filter is applied by the genotype loader.
#' }
#'
#' This function peeks at the first row to determine the data type.
#' The actual per-chromosome PLINK prefix is resolved later by
#' \code{resolve_plink_prefix_for_region()} at load time.
#' The actual per-chromosome path is resolved later by
#' \code{resolve_genotype_path_for_region()} at load time.
#'
#' @param path Path to a metadata TSV file with columns chrom, start, end, path.
#' @return A list with:
#' \item{type}{"plink2", "plink1", or "precomputed"}
#' \item{data_path}{PLINK prefix from first row (for type detection only; actual
#' per-chromosome prefix is resolved at load time)}
#' \item{type}{"plink2", "plink1", "vcf", "gds", or "precomputed"}
#' \item{data_path}{Genotype path from first row (for type detection only; actual
#' per-chromosome path is resolved at load time)}
#' \item{meta_path}{The metadata TSV path (always set)}
#' @importFrom vroom vroom
#' @noRd
Expand All @@ -359,22 +376,24 @@ resolve_ld_source <- function(path) {

if (has_plink2_files(resolved)) return(list(type = "plink2", data_path = resolved, meta_path = path))
if (has_plink1_files(resolved)) return(list(type = "plink1", data_path = resolved, meta_path = path))
if (is_vcf_path(resolved)) return(list(type = "vcf", data_path = resolved, meta_path = path))
if (is_gds_path(resolved)) return(list(type = "gds", data_path = resolved, meta_path = path))

# Pre-computed .cor.xz blocks — verify not using 0:0 sentinel
if (!is.na(meta$start) && !is.na(meta$end) && meta$start == 0 && meta$end == 0) {
stop("Metadata has start=0, end=0 but path does not resolve to PLINK files: ", resolved,
"\n The 0:0 sentinel is only valid for whole-chromosome PLINK genotype files.")
stop("Metadata has start=0, end=0 but path does not resolve to genotype files: ", resolved,
"\n The 0:0 sentinel is only valid for whole-chromosome genotype files.")
}

list(type = "precomputed", meta_path = path)
}

#' Resolve the correct PLINK prefix for a given region from a metadata TSV.
#' Resolve the correct genotype path for a given region from a metadata TSV.
#' Reads the TSV, finds the row matching the query region's chromosome,
#' and returns the resolved PLINK prefix path.
#' and returns the resolved genotype file path or prefix.
#' @importFrom vroom vroom
#' @noRd
resolve_plink_prefix_for_region <- function(meta_path, region, source_type) {
resolve_genotype_path_for_region <- function(meta_path, region) {
parsed <- parse_region(region)
meta <- as.data.frame(vroom(meta_path, show_col_types = FALSE))
colnames(meta) <- c("chrom", "start", "end", "path")
Expand All @@ -391,17 +410,13 @@ resolve_plink_prefix_for_region <- function(meta_path, region, source_type) {

# ---------- Internal: load LD from genotype files ----------

#' Load genotype data from PLINK files and compute LD or return genotype matrix.
#' @param source_type Character, "plink1" or "plink2" (from resolve_ld_source).
#' Load genotype data and compute LD or return genotype matrix.
#' @noRd
load_LD_from_genotype <- function(prefix, region, source_type,
load_LD_from_genotype <- function(genotype_path, region,
return_genotype = FALSE, n_sample = NULL) {
# Load genotype matrix and variant info
result <- if (source_type == "plink2") {
load_plink2_data(prefix, region = region)
} else {
load_plink1_data(prefix, region = region)
}
# Load genotype matrix and variant info via the unified loader
result <- load_genotype_region(genotype_path, region = region,
return_variant_info = TRUE)
X <- result$X
variant_info <- result$variant_info

Expand All @@ -415,19 +430,20 @@ load_LD_from_genotype <- function(prefix, region, source_type,
ref_panel <- parse_variant_id(variant_ids)
ref_panel$variant_id <- variant_ids

# Load allele frequency from .afreq file (required for PLINK sources)
afreq <- read_afreq(prefix)
if (is.null(afreq)) {
stop("Allele frequency file (.afreq or .afreq.zst) not found at prefix: ", prefix,
"\n The .afreq file is required for PLINK genotype LD sources.")
}
freq_match <- match(variant_info$id, afreq$id)
n_unmatched <- sum(is.na(freq_match))
if (n_unmatched > 0) {
warning(n_unmatched, " out of ", length(freq_match),
" variants have no allele frequency in .afreq file.")
# Load allele frequency from .afreq file if available, otherwise compute from genotypes
afreq <- read_afreq(genotype_path)
if (!is.null(afreq)) {
freq_match <- match(variant_info$id, afreq$id)
n_unmatched <- sum(is.na(freq_match))
if (n_unmatched > 0) {
warning(n_unmatched, " out of ", length(freq_match),
" variants have no allele frequency in .afreq file.")
}
ref_panel$allele_freq <- afreq$alt_freq[freq_match]
} else {
# Compute ALT allele frequency directly from the dosage matrix
ref_panel$allele_freq <- colMeans(X, na.rm = TRUE) / 2
}
ref_panel$allele_freq <- afreq$alt_freq[freq_match]

# Compute variance if sample size provided
if (!is.null(n_sample)) {
Expand Down
23 changes: 0 additions & 23 deletions R/RcppExports.R

This file was deleted.

11 changes: 8 additions & 3 deletions R/allele_qc.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' Match alleles between target data and reference variants
#' Match target data alleles against a reference panel
#'
#' Match by ("chrom", "A1", "A2" and "pos"), accounting for possible
#' strand flips and major/minor allele flips (opposite effects and zscores).
#' Flips specified columns when alleles are swapped relative to the reference.
#'
#' @param target_data A data frame with columns "chrom", "pos", "A2", "A1" (and optionally other columns like "beta" or "z"),
#' or a vector of strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1". Can be automatically converted to a data frame if a vector.
Expand All @@ -22,7 +23,7 @@
#' @importFrom vctrs vec_duplicate_detect
#' @importFrom tidyr separate
#' @export
allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
match_ref_panel <- function(target_data, ref_variants, col_to_flip = NULL,
match_min_prop = 0.2, remove_dups = TRUE,
remove_indels = FALSE, remove_strand_ambiguous = TRUE,
flip_strand = FALSE, remove_unmatched = TRUE, ...) {
Expand Down Expand Up @@ -183,6 +184,10 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
return(list(target_data_qced = result, qc_summary = match_result))
}

#' @rdname match_ref_panel
#' @export
allele_qc <- match_ref_panel

#' Align Variant Names
#'
#' This function aligns variant names from two strings containing variant names in the format of
Expand Down Expand Up @@ -228,7 +233,7 @@ align_variant_names <- function(source, reference, remove_indels = FALSE, remove
source_df <- parse_variant_id(source)
reference_df <- parse_variant_id(reference)

qc_result <- allele_qc(
qc_result <- match_ref_panel(
target_data = source_df,
ref_variants = reference_df,
col_to_flip = NULL,
Expand Down
11 changes: 8 additions & 3 deletions R/compute_qtl_enrichment.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,14 @@
#' en <- compute_qtl_enrichment(gwas_fit, susie_fits, lambda = lambda, ImpN = ImpN, num_threads = num_threads)
#'
#' @seealso \code{\link[susieR]{susie}}
#' @useDynLib pecotmr
#' @useDynLib pecotmr, .registration = TRUE
#' @export
#'
compute_qtl_enrichment <- function(gwas_pip, susie_qtl_regions,
num_gwas = NULL, pi_qtl = NULL,
lambda = 1.0, ImpN = 25,
double_shrinkage = FALSE,
bessel_correction = TRUE,
num_threads = 1, verbose = TRUE) {
if (is.null(num_gwas)) {
warning("num_gwas is not provided. Estimating pi_gwas from the data. Note that this estimate may be biased if the input gwas_pip does not contain genome-wide variants.")
Expand Down Expand Up @@ -111,14 +113,17 @@ compute_qtl_enrichment <- function(gwas_pip, susie_qtl_regions,
x
})

# cpp11 requires exact integer types for int parameters
en <- qtl_enrichment_rcpp(
r_gwas_pip = gwas_pip,
r_qtl_susie_fit = susie_qtl_regions,
pi_gwas = pi_gwas,
pi_qtl = pi_qtl,
ImpN = ImpN,
ImpN = as.integer(ImpN),
shrinkage_lambda = lambda,
num_threads = num_threads
double_shrinkage = double_shrinkage,
bessel_correction = bessel_correction,
num_threads = as.integer(num_threads)
)

# Add the unmatched variants to the output
Expand Down
21 changes: 21 additions & 0 deletions R/cpp11.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by cpp11: do not edit by hand

dentist_iterative_impute <- function(LD_mat_r, nSample, zScore_r, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose) {
.Call(`_pecotmr_dentist_iterative_impute`, LD_mat_r, nSample, zScore_r, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose)
}

lassosum_rss_rcpp <- function(z_r, LD, lambda_r, thr, maxiter) {
.Call(`_pecotmr_lassosum_rss_rcpp`, z_r, LD, lambda_r, thr, maxiter)
}

prs_cs_rcpp <- function(a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) {
.Call(`_pecotmr_prs_cs_rcpp`, a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed)
}

qtl_enrichment_rcpp <- function(r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, double_shrinkage, bessel_correction, num_threads) {
.Call(`_pecotmr_qtl_enrichment_rcpp`, r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, double_shrinkage, bessel_correction, num_threads)
}

sdpr_rcpp <- function(bhat_r, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose, seed) {
.Call(`_pecotmr_sdpr_rcpp`, bhat_r, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose, seed)
}
Loading