From b5bea1ec008225d89d44d1ea3cb69cef043b6d2e Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 10:38:37 +1000 Subject: [PATCH 1/9] Adding coverage --- .github/workflows/coverage.yml | 44 +++++ coverage.sh | 203 ++++++++++++++++++++++ rust/bioscript-formats/tests/prepare.rs | 220 ++++++++++++++++++++++++ 3 files changed, 467 insertions(+) create mode 100644 .github/workflows/coverage.yml create mode 100755 coverage.sh create mode 100644 rust/bioscript-formats/tests/prepare.rs diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml new file mode 100644 index 0000000..8fe6bf0 --- /dev/null +++ b/.github/workflows/coverage.yml @@ -0,0 +1,44 @@ +name: Coverage + +on: + push: + branches: [main] + pull_request: + +permissions: + contents: read + +jobs: + focused-coverage: + name: focused coverage (${{ matrix.test }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + test: [file_formats, inspect, prepare, cli] + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + with: + components: llvm-tools-preview + + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + + - name: Cache Rust build + uses: Swatinem/rust-cache@v2 + with: + workspaces: rust + key: coverage-${{ matrix.test }} + + - name: Run focused coverage + env: + AUTO_INSTALL_LLVM_COV: "0" + AUTO_INSTALL_LLVM_TOOLS: "0" + TEST_THREADS: "1" + run: ./coverage.sh --no-lint --focused-test "${{ matrix.test }}" diff --git a/coverage.sh b/coverage.sh new file mode 100755 index 0000000..d2e6c0c --- /dev/null +++ b/coverage.sh @@ -0,0 +1,203 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Coverage runner for the BioScript Rust crates using cargo-llvm-cov. 
+# - Mirrors test.sh by operating inside ./rust +# - Runs focused BioScript tests by default, with optional all-tests mode +# - Generates HTML report and LCOV file +# - Prints a sorted summary to stdout + +FULL_CLEAN_FLAG=0 +OPEN_HTML_FLAG=${OPEN_HTML:-0} +LARGE_FLAG=0 +ALL_TESTS_FLAG=0 +NO_LINT_FLAG=0 +FOCUSED_TEST="" + +usage() { + cat <<'EOF' +Usage: ./coverage.sh [--full-clean|-c] [--open] [--large] [--all-tests] [--no-lint] [--focused-test name] + + --full-clean, -c Run cargo clean and remove coverage dirs before running + --open Open HTML report locally (no-op in CI) + --large Include tests that require large local fixtures + --all-tests Run all tests for the first-party BioScript crates + --no-lint Skip cargo fmt and clippy checks + --focused-test Run one focused integration test target: + file_formats, inspect, prepare, or cli + +Environment: + AUTO_INSTALL_LLVM_COV=0 Do not auto-install cargo-llvm-cov + AUTO_INSTALL_LLVM_TOOLS=0 Do not auto-install llvm-tools-preview + FULL_CLEAN=1 Same as --full-clean + LCOV_OUT=path Override LCOV output path + OPEN_HTML=1 Same as --open + TEST_THREADS=n Test thread count for focused integration tests +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --full-clean|-c) + FULL_CLEAN_FLAG=1 + ;; + --open) + OPEN_HTML_FLAG=1 + ;; + --large) + LARGE_FLAG=1 + ;; + --all-tests) + ALL_TESTS_FLAG=1 + ;; + --no-lint) + NO_LINT_FLAG=1 + ;; + --focused-test) + if [[ $# -lt 2 ]]; then + echo "--focused-test requires a test target name" >&2 + usage >&2 + exit 2 + fi + FOCUSED_TEST="$2" + shift + ;; + --help|-h) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage >&2 + exit 2 + ;; + esac + shift +done + +ROOT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) +cd "$ROOT_DIR/rust" + +PACKAGES=( + bioscript-cli + bioscript-core + bioscript-formats + bioscript-runtime + bioscript-schema +) + +PKG_ARGS=() +for package in "${PACKAGES[@]}"; do + PKG_ARGS+=(-p "$package") +done + +TEST_RUSTFLAGS="${RUSTFLAGS:-} 
-Aunused-assignments -Amissing-docs" + +if [[ "$NO_LINT_FLAG" != "1" ]]; then + echo "==> Formatting and linting" + cargo fmt --check "${PKG_ARGS[@]}" + cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings +fi + +echo "==> Checking cargo-llvm-cov availability" +if ! cargo llvm-cov --version >/dev/null 2>&1; then + if [[ "${AUTO_INSTALL_LLVM_COV:-1}" == "1" ]]; then + echo "==> Installing cargo-llvm-cov (first run only)" + if ! cargo install cargo-llvm-cov; then + echo "Failed to install cargo-llvm-cov. Install manually with:" >&2 + echo " cargo install cargo-llvm-cov" >&2 + exit 1 + fi + else + echo "cargo-llvm-cov is not installed. Install with:" >&2 + echo " cargo install cargo-llvm-cov" >&2 + exit 1 + fi +fi + +if ! rustup component list --installed | grep -Eq '^(llvm-tools-preview|llvm-tools)'; then + if [[ "${AUTO_INSTALL_LLVM_TOOLS:-1}" == "1" ]]; then + echo "==> Installing rustup component: llvm-tools-preview (first run only)" + if ! rustup component add llvm-tools-preview; then + echo "Failed to install llvm-tools-preview. Install manually with:" >&2 + echo " rustup component add llvm-tools-preview" >&2 + exit 1 + fi + else + echo "llvm-tools-preview is missing. 
Enable auto-install or run:" >&2 + echo " rustup component add llvm-tools-preview" >&2 + exit 1 + fi +fi + +if [[ "${FULL_CLEAN:-0}" == "1" || "$FULL_CLEAN_FLAG" == "1" ]]; then + echo "==> FULL_CLEAN=1: performing cargo clean and removing coverage dirs" + cargo clean + rm -rf target/llvm-cov target/coverage target/llvm-cov-target +fi + +echo "==> Cleaning previous coverage artifacts" +cargo llvm-cov clean --workspace +mkdir -p target/coverage + +LCOV_OUT=${LCOV_OUT:-target/coverage/lcov.info} +TEST_THREADS=${TEST_THREADS:-1} +IGNORE_REGEX=${IGNORE_REGEX:-'(^|/)(monty|noodles|vendor)/'} +OPEN_FLAG="" +if [[ "$OPEN_HTML_FLAG" == "1" && "${CI:-0}" != "true" ]]; then + OPEN_FLAG="--open" +fi + +COV_ENV=(RUSTFLAGS="$TEST_RUSTFLAGS") +if [[ "$LARGE_FLAG" == "1" ]]; then + COV_ENV+=(BIOSCRIPT_RUN_LARGE_TESTS=1) +fi + +echo "==> Running coverage" +if [[ -n "$FOCUSED_TEST" ]]; then + case "$FOCUSED_TEST" in + file_formats) + env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-formats --test file_formats --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + ;; + inspect) + env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-formats --test inspect --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + ;; + prepare) + env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-formats --test prepare --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + ;; + cli) + env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-cli --test cli --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + ;; + *) + echo "Unknown focused test target: $FOCUSED_TEST" >&2 + usage >&2 + exit 2 + ;; + esac +elif [[ "$ALL_TESTS_FLAG" == "1" ]]; then + env "${COV_ENV[@]}" cargo llvm-cov "${PKG_ARGS[@]}" --all-targets --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG +else + env "${COV_ENV[@]}" cargo llvm-cov 
--no-report -p bioscript-formats --test file_formats -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test inspect -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test prepare -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --test cli -- --nocapture --test-threads="$TEST_THREADS" + cargo llvm-cov report "${PKG_ARGS[@]}" --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG +fi + +echo "==> Exporting LCOV" +cargo llvm-cov report "${PKG_ARGS[@]}" --ignore-filename-regex "$IGNORE_REGEX" --lcov --output-path "$LCOV_OUT" + +echo "==> Coverage summary (sorted by coverage %)" +SUMMARY_OUTPUT=$(cargo llvm-cov report "${PKG_ARGS[@]}" --ignore-filename-regex "$IGNORE_REGEX" --summary-only) +printf '%s\n' "$SUMMARY_OUTPUT" | head -n 3 +printf '%s\n' "$SUMMARY_OUTPUT" | tail -n +4 | grep -v "^TOTAL" | sort -t'%' -k3 -n +printf '%s\n' "$SUMMARY_OUTPUT" | grep "^TOTAL" + +HTML_DIR="target/llvm-cov/html" +if [[ -d "$HTML_DIR" ]]; then + echo "HTML report: rust/$HTML_DIR/index.html" +else + echo "HTML report directory not found. 
cargo-llvm-cov typically writes to target/llvm-cov/html" >&2 +fi + +echo "LCOV file: rust/$LCOV_OUT" diff --git a/rust/bioscript-formats/tests/prepare.rs b/rust/bioscript-formats/tests/prepare.rs new file mode 100644 index 0000000..4b99713 --- /dev/null +++ b/rust/bioscript-formats/tests/prepare.rs @@ -0,0 +1,220 @@ +use std::{ + fs, + path::PathBuf, + time::{SystemTime, UNIX_EPOCH}, +}; + +use bioscript_formats::{ + GenotypeSourceFormat, PrepareRequest, PreparedPaths, prepare_indexes, shell_flags, +}; + +fn temp_dir(label: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock drift") + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "bioscript-prepare-{label}-{}-{nanos}", + std::process::id() + )); + fs::create_dir_all(&dir).unwrap(); + dir +} + +fn request(root: PathBuf, cwd: PathBuf, cache_dir: PathBuf) -> PrepareRequest { + PrepareRequest { + root, + cwd, + cache_dir, + input_file: None, + input_format: None, + reference_file: None, + } +} + +#[test] +fn relative_input_path_resolves_inside_root() { + let root = temp_dir("relative-input-root"); + let cwd = temp_dir("relative-input-cwd"); + fs::write( + root.join("sample.txt"), + "rsid\tchromosome\tposition\tgenotype\n", + ) + .unwrap(); + + let mut req = request(root.clone(), cwd.clone(), PathBuf::from("cache")); + req.input_file = Some("sample.txt".to_owned()); + + let prepared = prepare_indexes(&req).unwrap(); + + let expected_input = root.join("sample.txt").canonicalize().unwrap(); + assert_eq!( + prepared.input_file.as_deref(), + Some(expected_input.as_path()) + ); + assert_eq!(prepared.input_index, None); + assert_eq!(prepared.cache_dir, cwd.join("cache")); +} + +#[test] +fn relative_path_escape_is_rejected() { + let root = temp_dir("relative-escape-root"); + let cwd = temp_dir("relative-escape-cwd"); + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some("../outside.txt".to_owned()); + + let err = 
prepare_indexes(&req).unwrap_err(); + + assert!(err.contains("path escapes bioscript root"), "{err}"); +} + +#[test] +fn absolute_path_outside_root_is_rejected() { + let root = temp_dir("absolute-escape-root"); + let cwd = temp_dir("absolute-escape-cwd"); + let outside = temp_dir("absolute-escape-outside").join("sample.txt"); + fs::write(&outside, "rsid\tchromosome\tposition\tgenotype\n").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some(outside.to_string_lossy().into_owned()); + + let err = prepare_indexes(&req).unwrap_err(); + + assert!(err.contains("path escapes bioscript root"), "{err}"); +} + +#[test] +fn absolute_cache_dir_is_preserved() { + let root = temp_dir("absolute-cache-root"); + let cwd = temp_dir("absolute-cache-cwd"); + let cache = temp_dir("absolute-cache-target"); + + let req = request(root, cwd, cache.clone()); + let prepared = prepare_indexes(&req).unwrap(); + + assert_eq!(prepared.cache_dir, cache); +} + +#[test] +fn adjacent_cram_index_is_detected_without_rebuilding() { + let root = temp_dir("adjacent-cram-root"); + let cwd = temp_dir("adjacent-cram-cwd"); + let input_path = root.join("sample.cram"); + let index_path = root.join("sample.cram.crai"); + fs::write(&input_path, b"not a real cram").unwrap(); + fs::write(&index_path, b"not a real crai").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some("sample.cram".to_owned()); + + let prepared = prepare_indexes(&req).unwrap(); + + let expected_index = index_path.canonicalize().unwrap(); + assert_eq!( + prepared.input_index.as_deref(), + Some(expected_index.as_path()) + ); +} + +#[test] +fn bam_without_adjacent_index_returns_clear_error() { + let root = temp_dir("bam-root"); + let cwd = temp_dir("bam-cwd"); + fs::write(root.join("sample.bam"), b"not a real bam").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some("sample.bam".to_owned()); + + let err = 
prepare_indexes(&req).unwrap_err(); + + assert!( + err.contains("alignment indexing only supports CRAM"), + "{err}" + ); +} + +#[test] +fn adjacent_fasta_index_is_detected() { + let root = temp_dir("adjacent-fasta-root"); + let cwd = temp_dir("adjacent-fasta-cwd"); + let fasta = root.join("ref.fa"); + let fai = root.join("ref.fa.fai"); + fs::write(&fasta, b">chr1\nACGT\n").unwrap(); + fs::write(&fai, b"chr1\t4\t6\t4\t5\n").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.reference_file = Some("ref.fa".to_owned()); + + let prepared = prepare_indexes(&req).unwrap(); + + let expected_fasta = fasta.canonicalize().unwrap(); + let expected_fai = fai.canonicalize().unwrap(); + assert_eq!( + prepared.reference_file.as_deref(), + Some(expected_fasta.as_path()) + ); + assert_eq!( + prepared.reference_index.as_deref(), + Some(expected_fai.as_path()) + ); +} + +#[test] +fn fasta_index_is_generated_in_cache_when_missing() { + let root = temp_dir("generated-fasta-root"); + let cwd = temp_dir("generated-fasta-cwd"); + let cache = cwd.join("cache"); + fs::write(root.join("ref.fa"), b">chr1\nACGT\n").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.reference_file = Some("ref.fa".to_owned()); + + let prepared = prepare_indexes(&req).unwrap(); + let cached_reference = prepared.reference_file.expect("cached reference"); + let cached_index = prepared.reference_index.expect("cached reference index"); + + assert!(cached_reference.starts_with(&cache)); + assert!(cached_reference.exists()); + assert!(cached_index.starts_with(&cache)); + assert!(cached_index.exists()); +} + +#[test] +fn shell_flags_quote_paths_with_spaces_and_single_quotes() { + let prepared = PreparedPaths { + input_file: Some(PathBuf::from("/tmp/input files/sample's.cram")), + input_index: Some(PathBuf::from("/tmp/input files/sample.cram.crai")), + reference_file: Some(PathBuf::from("/tmp/ref files/ref's.fa")), + reference_index: Some(PathBuf::from("/tmp/ref 
files/ref.fa.fai")), + cache_dir: PathBuf::from("/tmp/cache"), + }; + + let flags = shell_flags(&prepared); + + assert!(flags.contains("--input-file '/tmp/input files/sample'\"'\"'s.cram'")); + assert!(flags.contains("--input-index '/tmp/input files/sample.cram.crai'")); + assert!(flags.contains("--reference-file '/tmp/ref files/ref'\"'\"'s.fa'")); + assert!(flags.contains("--reference-index '/tmp/ref files/ref.fa.fai'")); +} + +#[test] +fn explicit_cram_format_triggers_index_detection_for_non_cram_extension() { + let root = temp_dir("forced-cram-root"); + let cwd = temp_dir("forced-cram-cwd"); + let input = root.join("sample.dat"); + let crai = root.join("sample.crai"); + fs::write(&input, b"not a real cram").unwrap(); + fs::write(&crai, b"not a real crai").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some("sample.dat".to_owned()); + req.input_format = Some(GenotypeSourceFormat::Cram); + + let err = prepare_indexes(&req).unwrap_err(); + + assert!( + err.contains("alignment indexing only supports CRAM") + || err.contains("failed to build alignment index"), + "{err}" + ); +} From b558117f04faf1f048fcf1870e9d679cc7c149b7 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 11:22:48 +1000 Subject: [PATCH 2/9] improving test coverage --- .github/workflows/coverage.yml | 2 +- .gitignore | 2 + coverage.sh | 21 ++- rust/bioscript-cli/tests/cli.rs | 141 +++++++++++++++ rust/bioscript-core/src/variant.rs | 54 ++++++ rust/bioscript-formats/src/prepare.rs | 6 +- rust/bioscript-formats/tests/file_formats.rs | 170 +++++++++++++++++- rust/bioscript-formats/tests/inspect.rs | 84 ++++++++- rust/bioscript-formats/tests/prepare.rs | 84 +++++++++ .../tests/resources_coverage.rs | 91 ++++++++++ rust/bioscript-runtime/tests/security.rs | 92 +++++++++- .../tests/validate_variants.rs | 151 ++++++++++++++++ rust/bioscript-wasm/src/lib.rs | 82 +++++---- 13 files changed, 936 insertions(+), 44 deletions(-) create mode 100644 
rust/bioscript-runtime/tests/resources_coverage.rs diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 8fe6bf0..4ed7f47 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - test: [file_formats, inspect, prepare, cli] + test: [file_formats, inspect, prepare, cli, schema, core, runtime_security, runtime_resources] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.gitignore b/.gitignore index d97da61..2bf5fcc 100644 --- a/.gitignore +++ b/.gitignore @@ -67,6 +67,8 @@ examples/herc2/herc2_*.tsv examples/herc2/classify_herc2_*.py rust/target/ bioscripts/output/ +cache/ +.bioscript-cache/ test-data/ test-reports/ local.sh diff --git a/coverage.sh b/coverage.sh index d2e6c0c..1832e5d 100755 --- a/coverage.sh +++ b/coverage.sh @@ -24,7 +24,8 @@ Usage: ./coverage.sh [--full-clean|-c] [--open] [--large] [--all-tests] [--no-li --all-tests Run all tests for the first-party BioScript crates --no-lint Skip cargo fmt and clippy checks --focused-test Run one focused integration test target: - file_formats, inspect, prepare, or cli + file_formats, inspect, prepare, cli, schema, core, runtime_security, + or runtime_resources Environment: AUTO_INSTALL_LLVM_COV=0 Do not auto-install cargo-llvm-cov @@ -96,7 +97,7 @@ TEST_RUSTFLAGS="${RUSTFLAGS:-} -Aunused-assignments -Amissing-docs" if [[ "$NO_LINT_FLAG" != "1" ]]; then echo "==> Formatting and linting" cargo fmt --check "${PKG_ARGS[@]}" - cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings + RUSTFLAGS="$TEST_RUSTFLAGS" cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings fi echo "==> Checking cargo-llvm-cov availability" @@ -168,6 +169,18 @@ if [[ -n "$FOCUSED_TEST" ]]; then cli) env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-cli --test cli --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" ;; + schema) + env 
"${COV_ENV[@]}" cargo llvm-cov -p bioscript-schema --test validate_variants --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + ;; + core) + env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-core --lib --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG + ;; + runtime_security) + env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-runtime --test security --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + ;; + runtime_resources) + env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-runtime --test resources_coverage --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + ;; *) echo "Unknown focused test target: $FOCUSED_TEST" >&2 usage >&2 @@ -181,6 +194,10 @@ else env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test inspect -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test prepare -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --test cli -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-schema --test validate_variants -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --lib + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test security -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test resources_coverage -- --nocapture --test-threads="$TEST_THREADS" cargo llvm-cov report "${PKG_ARGS[@]}" --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG fi diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index 5936b1d..4e8daf0 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -96,6 +96,33 @@ fn 
trace_report_is_written_for_hello_world() { assert!(trace.contains("hello from bioscript")); } +#[test] +fn timing_report_is_written_for_hello_world() { + let root = repo_root(); + let timing_path = root.join("bioscripts/output/hello-world.timing.tsv"); + if timing_path.exists() { + fs::remove_file(&timing_path).unwrap(); + } + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--timing-report") + .arg("bioscripts/output/hello-world.timing.tsv") + .arg("bioscripts/hello-world.py") + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let timing = fs::read_to_string(timing_path).unwrap(); + assert!(timing.contains("stage\tduration_ms\tdetail")); + assert!(timing.contains("run_file_total\t")); + assert!(timing.contains("script=bioscripts/hello-world.py")); +} + #[test] fn batch_lookup_query_plan_runs_and_preserves_requested_result_order() { let root = repo_root(); @@ -171,6 +198,120 @@ fn inspect_subcommand_reports_detected_vendor_and_platform() { assert!(stdout.contains("duration_ms\t")); } +#[test] +fn prepare_subcommand_reports_reference_index_flags() { + let root = repo_root(); + let dir = temp_dir("prepare-cli"); + let reference = dir.join("ref.fa"); + fs::write(&reference, b">chr1\nACGT\n").unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("prepare") + .arg("--root") + .arg(&dir) + .arg("--reference-file") + .arg("ref.fa") + .arg("--cache-dir") + .arg("cache") + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("--reference-file")); + assert!(stdout.contains("--reference-index")); + assert!(stdout.contains("cache")); +} + +#[test] +fn validate_variants_cli_returns_nonzero_and_writes_report() { + let root = repo_root(); + let dir = 
temp_dir("validate-variants-cli"); + let manifest = dir.join("bad-variant.yaml"); + let report = dir.join("reports/variants.txt"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant" +version: "1.0" +variant_id: "TEST_bad" +name: "bad" +identifiers: + rsids: + - "bad-rsid" +coordinates: + grch38: + chrom: "chrUn" + pos: 0 +alleles: + kind: "snv" + ref: "AA" + alts: [] +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("validate-variants") + .arg(&manifest) + .arg("--report") + .arg(&report) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("validation found"), "{stderr}"); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("errors"), "{stdout}"); + let report_text = fs::read_to_string(report).unwrap(); + assert!(report_text.contains("bad-rsid")); +} + +#[test] +fn validate_panels_cli_returns_nonzero_and_writes_report() { + let root = repo_root(); + let dir = temp_dir("validate-panels-cli"); + let panel = dir.join("bad-panel.yaml"); + let report = dir.join("reports/panels.txt"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "bad-panel" +members: + - kind: "variant" + path: "../outside.yaml" + sha256: "not-a-sha" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("validate-panels") + .arg(&panel) + .arg("--report") + .arg(&report) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("validation found"), "{stderr}"); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("errors"), "{stdout}"); + let report_text = fs::read_to_string(report).unwrap(); + assert!(report_text.contains("members[0].sha256")); +} + #[test] fn 
variant_manifest_runs_directly_via_cli() { let root = repo_root(); diff --git a/rust/bioscript-core/src/variant.rs b/rust/bioscript-core/src/variant.rs index e31622d..5d333ef 100644 --- a/rust/bioscript-core/src/variant.rs +++ b/rust/bioscript-core/src/variant.rs @@ -59,3 +59,57 @@ impl VariantSpec { self.grch37.is_some() || self.grch38.is_some() } } + +#[cfg(test)] +mod tests { + use super::{GenomicLocus, VariantSpec}; + + #[test] + fn default_variant_spec_has_no_lookup_keys() { + let spec = VariantSpec::default(); + + assert!(!spec.has_rsids()); + assert!(!spec.has_coordinates()); + } + + #[test] + fn variant_spec_reports_rsid_lookup_keys() { + let spec = VariantSpec { + rsids: vec!["rs73885319".to_owned()], + ..VariantSpec::default() + }; + + assert!(spec.has_rsids()); + assert!(!spec.has_coordinates()); + } + + #[test] + fn variant_spec_reports_grch37_coordinates() { + let spec = VariantSpec { + grch37: Some(GenomicLocus { + chrom: "22".to_owned(), + start: 36_265_860, + end: 36_265_861, + }), + ..VariantSpec::default() + }; + + assert!(!spec.has_rsids()); + assert!(spec.has_coordinates()); + } + + #[test] + fn variant_spec_reports_grch38_coordinates() { + let spec = VariantSpec { + grch38: Some(GenomicLocus { + chrom: "22".to_owned(), + start: 36_265_860, + end: 36_265_861, + }), + ..VariantSpec::default() + }; + + assert!(!spec.has_rsids()); + assert!(spec.has_coordinates()); + } +} diff --git a/rust/bioscript-formats/src/prepare.rs b/rust/bioscript-formats/src/prepare.rs index 3af9ca6..3d62a0d 100644 --- a/rust/bioscript-formats/src/prepare.rs +++ b/rust/bioscript-formats/src/prepare.rs @@ -83,8 +83,9 @@ fn canonical_dir(path: &Path) -> Result { fn resolve_rooted_path(root: &Path, raw: &str) -> Result { let raw_path = Path::new(raw); let resolved = if raw_path.is_absolute() { - ensure_path_within_root(root, raw_path)?; - raw_path.to_path_buf() + raw_path + .canonicalize() + .map_err(|err| format!("failed to resolve {}: {err}", raw_path.display()))? 
} else { ensure_relative_path_safe(raw_path)?; root.join(raw_path) @@ -92,6 +93,7 @@ fn resolve_rooted_path(root: &Path, raw: &str) -> Result { let canonical = resolved .canonicalize() .map_err(|err| format!("failed to resolve {}: {err}", resolved.display()))?; + ensure_path_within_root(root, &canonical)?; Ok(canonical) } diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index 300b4d2..e23add1 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -6,7 +6,7 @@ use std::{ }; use bioscript_core::{VariantKind, VariantSpec}; -use bioscript_formats::{GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore}; +use bioscript_formats::{GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, alignment}; use zip::write::SimpleFileOptions; fn temp_dir(label: &str) -> PathBuf { @@ -60,6 +60,174 @@ fn shared_fixture_or_skip(test_name: &str, relative: &str) -> Option { Some(path) } +fn zip_bytes(entry_name: &str, contents: &[u8]) -> Vec { + let cursor = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .start_file(entry_name, SimpleFileOptions::default()) + .unwrap(); + writer.write_all(contents).unwrap(); + writer.finish().unwrap().into_inner() +} + +#[test] +fn genotype_store_from_bytes_handles_genotype_text() { + let store = GenotypeStore::from_bytes( + "sample.txt", + b"\xef\xbb\xbfrsid\tchromosome\tposition\tgenotype\n\ + # skipped comment\n\ + rs73885319\t22\t36265860\tag\n\ + rs60910145\t22\t36265900\tN/A\n", + ) + .unwrap(); + + assert_eq!(store.backend_name(), "text"); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); + assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("--")); +} + +#[test] +fn genotype_store_from_bytes_handles_vcf() { + let store = GenotypeStore::from_bytes( + "sample.vcf", + b"##fileformat=VCFv4.2\n\ + ##FORMAT=\n\ + 
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + assert_eq!(store.backend_name(), "vcf"); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn genotype_store_from_bytes_handles_zip() { + let bytes = zip_bytes( + "nested/sample.txt", + b"rsid\tchromosome\tposition\tgenotype\nrs73885319\t22\t36265860\tAG\n", + ); + + let store = GenotypeStore::from_bytes("sample.zip", &bytes).unwrap(); + + assert_eq!(store.backend_name(), "zip"); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn genotype_store_from_bytes_rejects_malformed_zip() { + let err = GenotypeStore::from_bytes("sample.zip", b"not a zip").unwrap_err(); + + assert!( + format!("{err:?}").contains("failed to read genotype zip sample.zip"), + "{err:?}" + ); +} + +#[test] +fn genotype_store_from_bytes_rejects_zip_without_supported_entry() { + let bytes = zip_bytes("notes.bin", b"not genotype data"); + + let err = GenotypeStore::from_bytes("sample.zip", &bytes).unwrap_err(); + + assert!( + format!("{err:?}") + .contains("zip archive sample.zip does not contain a supported genotype file"), + "{err:?}" + ); +} + +#[test] +fn alignment_index_parsers_handle_in_memory_bytes() { + let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); + let _repository = alignment::build_reference_repository_from_readers( + std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), + fai, + ); + + let err = alignment::parse_fai_bytes(b"not a fai").unwrap_err(); + assert!(format!("{err:?}").contains("failed to parse FASTA index bytes")); + + let err = alignment::parse_crai_bytes(b"not a crai").unwrap_err(); + assert!(format!("{err:?}").contains("failed to parse CRAM index bytes")); + + let err = alignment::parse_tbi_bytes(b"not a tbi").unwrap_err(); + assert!(format!("{err:?}").contains("failed to parse tabix index bytes")); +} + +#[test] +fn 
delimited_parser_handles_comments_blank_lines_csv_and_split_alleles() { + let dir = temp_dir("csv-split-alleles"); + let path = dir.join("sample.csv"); + fs::write( + &path, + "\n\ + # rsid,chromosome,position,allele1,allele2\n\ + // ignored comment\n\ + rs73885319,chr22,36265860,a,g\n\ + rs60910145,22,36265900,n/a,\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); + assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("--")); + + let observation = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "22".to_owned(), + start: 36_265_860, + end: 36_265_861, + }), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(observation.genotype.as_deref(), Some("AG")); + assert_eq!( + observation.evidence, + vec!["resolved by locus chr22:36265860".to_owned()] + ); +} + +#[test] +fn vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { + let dir = temp_dir("vcf-chr-normalize"); + let path = dir.join("sample.vcf"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=GRCh38\n\ + ##FORMAT=\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + chr1\t1000\t.\tA\tC,G\t.\tPASS\t.\tGT\t2/1\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + let observation = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 1000, + end: 1001, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap(); + + assert_eq!(observation.genotype.as_deref(), Some("GC")); + assert_eq!(observation.assembly, Some(bioscript_core::Assembly::Grch38)); + assert_eq!( + observation.evidence, + vec!["resolved by locus chr1:1000".to_owned()] + ); +} + #[test] fn zip_genotype_file_is_auto_detected_and_readable() { let dir = 
temp_dir("zip-auto"); diff --git a/rust/bioscript-formats/tests/inspect.rs b/rust/bioscript-formats/tests/inspect.rs index 185ccce..717444d 100644 --- a/rust/bioscript-formats/tests/inspect.rs +++ b/rust/bioscript-formats/tests/inspect.rs @@ -1,11 +1,13 @@ use std::{ env, + io::Write as _, path::PathBuf, time::{Instant, SystemTime, UNIX_EPOCH}, }; use bioscript_core::Assembly; -use bioscript_formats::{DetectedKind, FileContainer, InspectOptions, inspect_file}; +use bioscript_formats::{DetectedKind, FileContainer, InspectOptions, inspect_bytes, inspect_file}; +use zip::write::SimpleFileOptions; fn repo_root() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) @@ -62,6 +64,86 @@ fn temp_dir(label: &str) -> PathBuf { dir } +fn zip_bytes(entry_name: &str, contents: &[u8]) -> Vec { + let cursor = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .start_file(entry_name, SimpleFileOptions::default()) + .unwrap(); + writer.write_all(contents).unwrap(); + writer.finish().unwrap().into_inner() +} + +#[test] +fn inspect_bytes_handles_genotype_text() { + let inspection = inspect_bytes( + "sample.txt", + b"rsid\tchromosome\tposition\tgenotype\n\ + rs73885319\t22\t36265860\tAG\n\ + rs60910145\t22\t36265900\tTG\n\ + rs71785313\t22\t36266005\tII\n", + &InspectOptions::default(), + ) + .unwrap(); + + assert_eq!(inspection.container, FileContainer::Plain); + assert_eq!(inspection.detected_kind, DetectedKind::GenotypeText); +} + +#[test] +fn inspect_bytes_handles_vcf() { + let inspection = inspect_bytes( + "sample.vcf", + b"##fileformat=VCFv4.2\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + &InspectOptions::default(), + ) + .unwrap(); + + assert_eq!(inspection.container, FileContainer::Plain); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.phased, Some(false)); +} + +#[test] +fn inspect_bytes_handles_zip() { + let bytes 
= zip_bytes( + "nested/sample.txt", + b"rsid\tchromosome\tposition\tgenotype\n\ + rs73885319\t22\t36265860\tAG\n\ + rs60910145\t22\t36265900\tTG\n\ + rs71785313\t22\t36266005\tII\n", + ); + + let inspection = inspect_bytes("sample.zip", &bytes, &InspectOptions::default()).unwrap(); + + assert_eq!(inspection.container, FileContainer::Zip); + assert_eq!(inspection.detected_kind, DetectedKind::GenotypeText); + assert_eq!( + inspection.selected_entry.as_deref(), + Some("nested/sample.txt") + ); +} + +#[test] +fn inspect_bytes_handles_unknown_bytes_conservatively() { + let inspection = inspect_bytes( + "sample.bin", + b"this is not recognizable genotype or alignment data\n", + &InspectOptions::default(), + ) + .unwrap(); + + assert_eq!(inspection.container, FileContainer::Plain); + assert_eq!(inspection.detected_kind, DetectedKind::Unknown); + assert!( + inspection + .warnings + .contains(&"file did not match known textual heuristics".to_owned()) + ); +} + #[test] fn ancestrydna_text_fixture_reports_vendor_platform_and_build() { let path = fixtures_dir().join("ancestrydna_v2_sample.txt"); diff --git a/rust/bioscript-formats/tests/prepare.rs b/rust/bioscript-formats/tests/prepare.rs index 4b99713..63d86c0 100644 --- a/rust/bioscript-formats/tests/prepare.rs +++ b/rust/bioscript-formats/tests/prepare.rs @@ -83,6 +83,38 @@ fn absolute_path_outside_root_is_rejected() { assert!(err.contains("path escapes bioscript root"), "{err}"); } +#[test] +fn absolute_path_inside_root_is_allowed() { + let root = temp_dir("absolute-inside-root"); + let cwd = temp_dir("absolute-inside-cwd"); + let input = root.join("sample.txt"); + fs::write(&input, "rsid\tchromosome\tposition\tgenotype\n").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some(input.to_string_lossy().into_owned()); + + let prepared = prepare_indexes(&req).unwrap(); + + assert_eq!( + prepared.input_file.as_deref(), + Some(input.canonicalize().unwrap().as_path()) + ); +} + +#[test] 
+fn missing_input_file_returns_clear_error() { + let root = temp_dir("missing-input-root"); + let cwd = temp_dir("missing-input-cwd"); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some("missing.txt".to_owned()); + + let err = prepare_indexes(&req).unwrap_err(); + + assert!(err.contains("failed to resolve"), "{err}"); + assert!(err.contains("missing.txt"), "{err}"); +} + #[test] fn absolute_cache_dir_is_preserved() { let root = temp_dir("absolute-cache-root"); @@ -116,6 +148,26 @@ fn adjacent_cram_index_is_detected_without_rebuilding() { ); } +#[test] +fn adjacent_short_cram_index_is_detected_without_rebuilding() { + let root = temp_dir("adjacent-short-cram-root"); + let cwd = temp_dir("adjacent-short-cram-cwd"); + let input_path = root.join("sample.cram"); + let index_path = root.join("sample.crai"); + fs::write(&input_path, b"not a real cram").unwrap(); + fs::write(&index_path, b"not a real crai").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some("sample.cram".to_owned()); + + let prepared = prepare_indexes(&req).unwrap(); + + assert_eq!( + prepared.input_index.as_deref(), + Some(index_path.canonicalize().unwrap().as_path()) + ); +} + #[test] fn bam_without_adjacent_index_returns_clear_error() { let root = temp_dir("bam-root"); @@ -179,6 +231,28 @@ fn fasta_index_is_generated_in_cache_when_missing() { assert!(cached_index.exists()); } +#[test] +fn fasta_without_extension_uses_fai_extension() { + let root = temp_dir("fasta-no-extension-root"); + let cwd = temp_dir("fasta-no-extension-cwd"); + let cache = cwd.join("cache"); + fs::write(root.join("reference"), b">chr1\nACGT\n").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.reference_file = Some("reference".to_owned()); + + let prepared = prepare_indexes(&req).unwrap(); + let cached_reference = prepared.reference_file.expect("cached reference"); + let cached_index = prepared.reference_index.expect("cached 
reference index"); + + assert!(cached_reference.starts_with(&cache)); + assert_eq!( + cached_index.extension().and_then(|ext| ext.to_str()), + Some("fai") + ); + assert!(cached_index.exists()); +} + #[test] fn shell_flags_quote_paths_with_spaces_and_single_quotes() { let prepared = PreparedPaths { @@ -197,6 +271,16 @@ fn shell_flags_quote_paths_with_spaces_and_single_quotes() { assert!(flags.contains("--reference-index '/tmp/ref files/ref.fa.fai'")); } +#[test] +fn shell_flags_are_empty_when_nothing_was_prepared() { + let prepared = PreparedPaths { + cache_dir: PathBuf::from("/tmp/cache"), + ..PreparedPaths::default() + }; + + assert_eq!(shell_flags(&prepared), ""); +} + #[test] fn explicit_cram_format_triggers_index_detection_for_non_cram_extension() { let root = temp_dir("forced-cram-root"); diff --git a/rust/bioscript-runtime/tests/resources_coverage.rs b/rust/bioscript-runtime/tests/resources_coverage.rs new file mode 100644 index 0000000..dec19d5 --- /dev/null +++ b/rust/bioscript-runtime/tests/resources_coverage.rs @@ -0,0 +1,91 @@ +use std::{ + fs, + path::PathBuf, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; + +use bioscript_formats::GenotypeLoadOptions; +use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; +use monty::ResourceLimits; + +fn temp_dir(label: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock drift") + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "bioscript-runtime-coverage-{label}-{}-{nanos}", + std::process::id() + )); + fs::create_dir_all(&dir).unwrap(); + dir +} + +fn run_script(code: &str, limits: ResourceLimits) -> Result<(), String> { + let dir = temp_dir("resources"); + let script = dir.join("script.py"); + fs::write(&script, code).unwrap(); + + let runtime = BioscriptRuntime::with_config( + &dir, + RuntimeConfig { + limits, + loader: GenotypeLoadOptions::default(), + }, + ) + .unwrap(); + + runtime + .run_file(&script, None, Vec::new()) + .map(|_| ()) + 
.map_err(|err| err.to_string()) +} + +#[test] +fn coverage_infinite_loop_times_out() { + let err = run_script( + "while True:\n pass\n", + ResourceLimits::new().max_duration(Duration::from_millis(10)), + ) + .unwrap_err(); + + assert!(err.contains("time limit exceeded"), "{err}"); +} + +#[test] +fn coverage_large_allocation_fails() { + let err = run_script( + "x = 'a' * 1_000_000\nprint(len(x))\n", + ResourceLimits::new().max_memory(65_536), + ) + .unwrap_err(); + + assert!(err.contains("memory limit exceeded"), "{err}"); +} + +#[test] +fn coverage_giant_string_amplification_fails() { + let err = run_script( + "text = 'a'\nwhile True:\n text = text + text\n", + ResourceLimits::new().max_memory(65_536), + ) + .unwrap_err(); + + assert!(err.contains("memory limit exceeded"), "{err}"); +} + +#[test] +fn coverage_giant_list_growth_fails() { + let err = run_script( + "items = []\nwhile True:\n items.append('x' * 1000)\n", + ResourceLimits::new() + .max_memory(65_536) + .max_duration(Duration::from_millis(50)), + ) + .unwrap_err(); + + assert!( + err.contains("memory limit exceeded") || err.contains("time limit exceeded"), + "{err}" + ); +} diff --git a/rust/bioscript-runtime/tests/security.rs b/rust/bioscript-runtime/tests/security.rs index 4d24c9e..312d683 100644 --- a/rust/bioscript-runtime/tests/security.rs +++ b/rust/bioscript-runtime/tests/security.rs @@ -6,6 +6,7 @@ use std::{ use bioscript_formats::GenotypeLoadOptions; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; +use monty::MontyObject; fn temp_dir(label: &str) -> PathBuf { let nanos = SystemTime::now() @@ -40,6 +41,29 @@ fn run_script(code: &str) -> Result<(), String> { .map_err(|err| err.to_string()) } +fn run_script_with_inputs( + root: &PathBuf, + code: &str, + inputs: Vec<(&str, MontyObject)>, +) -> Result { + let script = root.join("script.py"); + fs::write(&script, code).unwrap(); + + let runtime = BioscriptRuntime::with_config( + root, + RuntimeConfig { + loader: 
GenotypeLoadOptions::default(), + ..RuntimeConfig::default() + }, + ) + .unwrap(); + + runtime + .run_file(&script, None, inputs) + .map(|_| runtime.clone()) + .map_err(|err| err.to_string()) +} + #[test] fn open_builtin_is_not_available() { let err = run_script("open('secret.txt')\n").unwrap_err(); @@ -49,7 +73,7 @@ fn open_builtin_is_not_available() { #[test] fn eval_builtin_is_not_available() { let err = run_script("eval('1 + 1')\n").unwrap_err(); - assert!(err.contains("unknown bioscript host function: eval")); + assert!(err.contains("eval"), "{err}"); } #[test] @@ -82,3 +106,69 @@ fn unsupported_networkish_import_fails() { let err = run_script("import urllib\n").unwrap_err(); assert!(err.contains("No module named 'urllib'")); } + +#[test] +fn host_read_write_text_allows_nested_safe_paths() { + let dir = temp_dir("nested-safe-paths"); + fs::create_dir_all(dir.join("inputs/nested")).unwrap(); + fs::write(dir.join("inputs/nested/source.txt"), "hello nested").unwrap(); + + run_script_with_inputs( + &dir, + r#" +def main(): + text = bioscript.read_text("inputs/nested/source.txt") + bioscript.write_text("outputs/nested/result.txt", text + " output") + +if __name__ == "__main__": + main() +"#, + Vec::new(), + ) + .unwrap(); + + let written = fs::read_to_string(dir.join("outputs/nested/result.txt")).unwrap(); + assert_eq!(written, "hello nested output"); +} + +#[test] +fn runtime_lookup_details_reports_missing_variant_and_no_call() { + let dir = temp_dir("lookup-missing-no-call"); + fs::write( + dir.join("genotypes.txt"), + "rsid\tchromosome\tposition\tgenotype\nrsNoCall\t1\t100\tN/A\n", + ) + .unwrap(); + + let runtime = run_script_with_inputs( + &dir, + r#" +MISSING = bioscript.variant(rsid="rsMissing") +NO_CALL = bioscript.variant(rsid="rsNoCall") + +def main(): + genotypes = bioscript.load_genotypes(input_file) + missing = genotypes.lookup_variant_details(MISSING) + no_call = genotypes.lookup_variant_details(NO_CALL) + bioscript.write_text("outputs/details.txt", 
str(missing) + "\n" + str(no_call)) + +if __name__ == "__main__": + main() +"#, + vec![( + "input_file", + MontyObject::String("genotypes.txt".to_owned()), + )], + ) + .unwrap(); + + let timings = runtime.timing_snapshot(); + assert!( + timings + .iter() + .any(|timing| timing.stage == "lookup_variant_details") + ); + let details = fs::read_to_string(dir.join("outputs/details.txt")).unwrap(); + assert!(details.contains("genotype=None"), "{details}"); + assert!(details.contains("genotype='--'"), "{details}"); +} diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index 25bd584..635dbc9 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -257,3 +257,154 @@ members: "https://github.com/madhavajay/exvitae/blob/main/assays/pgx/GLP1/variants/rs2048683.yaml" ); } + +#[test] +fn remote_resource_resolution_classifies_python_without_parsing() { + let text = "print('hello from remote bioscript')\n"; + + let resolved = resolve_remote_resource_text( + "https://github.com/OpenMined/bioscript/blob/main/example.py", + "example.py", + text, + ) + .unwrap(); + + assert_eq!(resolved.kind, RemoteResourceKind::Python); + assert_eq!(resolved.schema, None); + assert_eq!(resolved.title, "example.py"); + assert_eq!( + resolved.sha256, + "b6d9c1ee20c7fb054ebd7defd271d7956b25d8d0c3ef451eaf6adcfda8a61b0f" + ); +} + +#[test] +fn remote_resource_resolution_classifies_schema_kinds() { + let cases = [ + ( + "variant.yaml", + "bioscript:variant:1.0", + RemoteResourceKind::Variant, + ), + ( + "panel.yaml", + "bioscript:panel:1.0", + RemoteResourceKind::Panel, + ), + ( + "catalogue.yaml", + "bioscript:catalogue:1.0", + RemoteResourceKind::Catalogue, + ), + ( + "assay.yaml", + "bioscript:assay:1.0", + RemoteResourceKind::Assay, + ), + ]; + + for (name, schema, expected) in cases { + let text = format!( + r#" +schema: "{schema}" +version: "1.0" +name: "{name}" +"# + ); + + 
let resolved = + resolve_remote_resource_text("https://example.com/resources/index.yaml", name, &text) + .unwrap(); + + assert_eq!(resolved.kind, expected, "{name}"); + assert_eq!(resolved.schema.as_deref(), Some(schema), "{name}"); + } +} + +#[test] +fn remote_resource_resolution_infers_kind_from_fields_without_schema() { + let cases = [ + ("members.yaml", "members: []\n", RemoteResourceKind::Panel), + ("variants.yaml", "variants: []\n", RemoteResourceKind::Panel), + ( + "catalogue.yaml", + "assays: []\n", + RemoteResourceKind::Catalogue, + ), + ( + "assay.yaml", + "assay:\n package_version: \"2026.1\"\n", + RemoteResourceKind::Assay, + ), + ( + "variant.yaml", + "variant_id: TEST_rs1\ncoordinates: {}\n", + RemoteResourceKind::Variant, + ), + ( + "unknown.yaml", + "name: just-a-file\n", + RemoteResourceKind::Unknown, + ), + ]; + + for (name, text, expected) in cases { + let resolved = + resolve_remote_resource_text("https://example.com/resources/index.yaml", name, text) + .unwrap(); + + assert_eq!(resolved.kind, expected, "{name}"); + } +} + +#[test] +fn remote_resource_resolution_resolves_github_dependencies_and_dedupes_urls() { + let text = r#" +schema: "bioscript:panel:1.0" +version: "1.0" +members: + - path: "variants/rs1.yaml" + version: "1.1" + - path: "variants/rs1.yaml" + version: "1.1" + - path: "/shared/rs2.yaml" +downloads: + - url: "https://example.com/reference.json" +"#; + + let resolved = resolve_remote_resource_text( + "https://github.com/OpenMined/bioscript/blob/main/panels/panel.yaml", + "panel.yaml", + text, + ) + .unwrap(); + + let urls = resolved + .dependencies + .iter() + .map(|dependency| dependency.url.as_str()) + .collect::>(); + + assert_eq!( + urls, + vec![ + "https://example.com/reference.json", + "https://github.com/OpenMined/bioscript/blob/main/panels/variants/rs1.yaml", + "https://github.com/OpenMined/bioscript/blob/main/shared/rs2.yaml", + ] + ); + assert_eq!(resolved.dependencies[0].kind, "download"); + 
assert_eq!(resolved.dependencies[1].kind, "member"); + assert_eq!(resolved.dependencies[1].version.as_deref(), Some("1.1")); +} + +#[test] +fn remote_resource_resolution_reports_invalid_structured_text() { + let err = resolve_remote_resource_text("https://example.com/bad.yaml", "bad.yaml", ":\n") + .unwrap_err(); + + assert!( + err.contains("failed to parse YAML resource bad.yaml"), + "{err}" + ); +} diff --git a/rust/bioscript-wasm/src/lib.rs b/rust/bioscript-wasm/src/lib.rs index c94f6d0..563c150 100644 --- a/rust/bioscript-wasm/src/lib.rs +++ b/rust/bioscript-wasm/src/lib.rs @@ -21,9 +21,8 @@ use std::{io::BufReader, path::PathBuf}; use bioscript_core::{GenomicLocus, VariantKind, VariantObservation, VariantSpec}; use bioscript_formats::{ alignment, inspect_bytes as inspect_bytes_rs, observe_cram_indel_with_reader, - observe_cram_snp_with_reader, - observe_vcf_snp_with_reader, DetectedKind, DetectionConfidence, FileContainer, FileInspection, - GenotypeStore, InspectOptions, SourceMetadata, + observe_cram_snp_with_reader, observe_vcf_snp_with_reader, DetectedKind, DetectionConfidence, + FileContainer, FileInspection, GenotypeStore, InspectOptions, SourceMetadata, }; use bioscript_schema::{ load_variant_manifest_text_for_lookup, @@ -272,42 +271,43 @@ pub fn lookup_cram_variants( end, }; let kind = parse_variant_kind(variant.kind.as_deref()).unwrap_or(VariantKind::Snp); - let observation = match kind { - VariantKind::Snp => { - ensure_single_base_variant(&variant)?; - let ref_char = variant.ref_base.chars().next().ok_or_else(|| { - JsError::new(&format!("variant {}: empty ref", variant.name)) - })?; - let alt_char = variant.alt_base.chars().next().ok_or_else(|| { - JsError::new(&format!("variant {}: empty alt", variant.name)) - })?; - observe_cram_snp_with_reader( + let observation = + match kind { + VariantKind::Snp => { + ensure_single_base_variant(&variant)?; + let ref_char = variant.ref_base.chars().next().ok_or_else(|| { + JsError::new(&format!("variant {}: 
empty ref", variant.name)) + })?; + let alt_char = variant.alt_base.chars().next().ok_or_else(|| { + JsError::new(&format!("variant {}: empty alt", variant.name)) + })?; + observe_cram_snp_with_reader( + &mut indexed, + &variant.name, + &locus, + ref_char, + alt_char, + variant.rsid.clone(), + assembly, + ) + } + VariantKind::Insertion | VariantKind::Indel => observe_cram_indel_with_reader( &mut indexed, &variant.name, &locus, - ref_char, - alt_char, + &variant.ref_base, + &variant.alt_base, variant.rsid.clone(), assembly, - ) + ), + other => { + return Err(JsError::new(&format!( + "variant {} has unsupported kind {:?} for web CRAM lookup", + variant.name, other + ))); + } } - VariantKind::Insertion | VariantKind::Indel => observe_cram_indel_with_reader( - &mut indexed, - &variant.name, - &locus, - &variant.ref_base, - &variant.alt_base, - variant.rsid.clone(), - assembly, - ), - other => { - return Err(JsError::new(&format!( - "variant {} has unsupported kind {:?} for web CRAM lookup", - variant.name, other - ))); - } - } - .map_err(|err| JsError::new(&format!("lookup {}: {err:?}", variant.name)))?; + .map_err(|err| JsError::new(&format!("lookup {}: {err:?}", variant.name)))?; results.push(VariantObservationJs { name: variant.name, backend: observation.backend, @@ -429,9 +429,16 @@ pub fn lookup_genotype_bytes_variants( } fn ensure_single_base_variant(variant: &VariantInput) -> Result<(), JsError> { - let kind = variant.kind.as_deref().unwrap_or("snv").to_ascii_lowercase(); + let kind = variant + .kind + .as_deref() + .unwrap_or("snv") + .to_ascii_lowercase(); let is_snp_kind = matches!(kind.as_str(), "snp" | "snv" | "variant" | ""); - if !is_snp_kind || variant.ref_base.chars().count() != 1 || variant.alt_base.chars().count() != 1 { + if !is_snp_kind + || variant.ref_base.chars().count() != 1 + || variant.alt_base.chars().count() != 1 + { return Err(JsError::new(&format!( "variant {} has kind/ref/alt {} {}/{}; web CRAM/VCF lookup currently supports single-base 
SNV observations only", variant.name, @@ -487,7 +494,10 @@ fn parse_variant_kind(kind: Option<&str>) -> Option { } } -fn observation_to_js(variant: VariantInput, observation: VariantObservation) -> VariantObservationJs { +fn observation_to_js( + variant: VariantInput, + observation: VariantObservation, +) -> VariantObservationJs { VariantObservationJs { name: variant.name, backend: observation.backend, From 4178bc0eecec9e070a3324e3f89e55beb2e58faf Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 14:15:35 +1000 Subject: [PATCH 3/9] remove duplicate test running --- .github/workflows/ci.yml | 9 +- coverage.sh | 22 +- rust/bioscript-formats/tests/file_formats.rs | 191 +++++++++++++++++- .../tests/validate_variants.rs | 138 ++++++++++++- 4 files changed, 340 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9fc4448..13e1af9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,7 +9,8 @@ permissions: contents: read jobs: - lint-and-test: + lint: + name: lint runs-on: ubuntu-latest steps: - name: Checkout @@ -29,9 +30,3 @@ jobs: - name: Lint (cargo fmt + clippy) run: ./lint.sh - - - name: Test (cargo test) - run: ./test.sh - - - name: Format integration tests (CRAM fixtures, MD5 fallback) - run: cargo test --manifest-path rust/Cargo.toml -p bioscript-formats --test file_formats -- --test-threads=1 diff --git a/coverage.sh b/coverage.sh index 1832e5d..9cb9d94 100755 --- a/coverage.sh +++ b/coverage.sh @@ -158,28 +158,28 @@ echo "==> Running coverage" if [[ -n "$FOCUSED_TEST" ]]; then case "$FOCUSED_TEST" in file_formats) - env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-formats --test file_formats --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test file_formats -- --nocapture --test-threads="$TEST_THREADS" ;; inspect) - env "${COV_ENV[@]}" cargo llvm-cov -p 
bioscript-formats --test inspect --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test inspect -- --nocapture --test-threads="$TEST_THREADS" ;; prepare) - env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-formats --test prepare --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test prepare -- --nocapture --test-threads="$TEST_THREADS" ;; cli) - env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-cli --test cli --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --test cli -- --nocapture --test-threads="$TEST_THREADS" ;; schema) - env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-schema --test validate_variants --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-schema --test validate_variants -- --nocapture --test-threads="$TEST_THREADS" ;; core) - env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-core --lib --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --lib ;; runtime_security) - env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-runtime --test security --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test security -- --nocapture --test-threads="$TEST_THREADS" ;; runtime_resources) - env "${COV_ENV[@]}" cargo llvm-cov -p bioscript-runtime --test resources_coverage --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo 
llvm-cov --no-report -p bioscript-runtime --test resources_coverage -- --nocapture --test-threads="$TEST_THREADS" ;; *) echo "Unknown focused test target: $FOCUSED_TEST" >&2 @@ -188,7 +188,7 @@ if [[ -n "$FOCUSED_TEST" ]]; then ;; esac elif [[ "$ALL_TESTS_FLAG" == "1" ]]; then - env "${COV_ENV[@]}" cargo llvm-cov "${PKG_ARGS[@]}" --all-targets --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG + env "${COV_ENV[@]}" cargo llvm-cov --no-report "${PKG_ARGS[@]}" --all-targets else env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test file_formats -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test inspect -- --nocapture --test-threads="$TEST_THREADS" @@ -198,9 +198,11 @@ else env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --lib env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test security -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test resources_coverage -- --nocapture --test-threads="$TEST_THREADS" - cargo llvm-cov report "${PKG_ARGS[@]}" --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG fi +echo "==> Generating HTML report" +cargo llvm-cov report "${PKG_ARGS[@]}" --html --ignore-filename-regex "$IGNORE_REGEX" $OPEN_FLAG + echo "==> Exporting LCOV" cargo llvm-cov report "${PKG_ARGS[@]}" --ignore-filename-regex "$IGNORE_REGEX" --lcov --output-path "$LCOV_OUT" diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index e23add1..b38bc38 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -6,7 +6,9 @@ use std::{ }; use bioscript_core::{VariantKind, VariantSpec}; -use bioscript_formats::{GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, alignment}; +use bioscript_formats::{ + GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, 
QueryKind, alignment, +}; use zip::write::SimpleFileOptions; fn temp_dir(label: &str) -> PathBuf { @@ -86,6 +88,82 @@ fn genotype_store_from_bytes_handles_genotype_text() { assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("--")); } +#[test] +fn genotype_source_format_parses_supported_values_and_rejects_unknowns() { + assert_eq!( + "txt".parse::().unwrap(), + GenotypeSourceFormat::Text + ); + assert_eq!( + "GENOTYPE".parse::().unwrap(), + GenotypeSourceFormat::Text + ); + assert_eq!( + "zip".parse::().unwrap(), + GenotypeSourceFormat::Zip + ); + assert_eq!( + "vcf".parse::().unwrap(), + GenotypeSourceFormat::Vcf + ); + assert_eq!( + "cram".parse::().unwrap(), + GenotypeSourceFormat::Cram + ); + + let err = "bam".parse::().unwrap_err(); + assert_eq!(err, "unsupported input format: bam"); +} + +#[test] +fn backend_capabilities_match_query_backend_type() { + let rsid_map = GenotypeStore::from_bytes( + "sample.txt", + b"rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + assert_eq!(rsid_map.backend_name(), "text"); + assert!(rsid_map.supports(QueryKind::GenotypeByRsid)); + assert!(!rsid_map.supports(QueryKind::GenotypeByLocus)); + + let dir = temp_dir("backend-capabilities"); + let text_path = dir.join("sample.txt"); + fs::write( + &text_path, + "rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + let delimited = GenotypeStore::from_file(&text_path).unwrap(); + assert_eq!(delimited.backend_name(), "text"); + assert!(delimited.supports(QueryKind::GenotypeByRsid)); + assert!(delimited.supports(QueryKind::GenotypeByLocus)); + + let vcf_path = dir.join("sample.vcf"); + fs::write( + &vcf_path, + "##fileformat=VCFv4.2\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + let vcf = GenotypeStore::from_file(&vcf_path).unwrap(); + assert_eq!(vcf.backend_name(), "vcf"); + assert!(vcf.supports(QueryKind::GenotypeByRsid)); + 
assert!(vcf.supports(QueryKind::GenotypeByLocus)); + + let cram = GenotypeStore::from_file_with_options( + &dir.join("sample.dat"), + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Cram), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + assert_eq!(cram.backend_name(), "cram"); + assert!(!cram.supports(QueryKind::GenotypeByRsid)); + assert!(cram.supports(QueryKind::GenotypeByLocus)); +} + #[test] fn genotype_store_from_bytes_handles_vcf() { let store = GenotypeStore::from_bytes( @@ -101,6 +179,71 @@ fn genotype_store_from_bytes_handles_vcf() { assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); } +#[test] +fn extensionless_vcf_is_detected_by_content_and_can_be_forced() { + let dir = temp_dir("extensionless-vcf"); + let path = dir.join("sample.data"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=GRCh37\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let detected = GenotypeStore::from_file(&path).unwrap(); + assert_eq!(detected.backend_name(), "vcf"); + assert_eq!(detected.get("rs1").unwrap().as_deref(), Some("AG")); + + let forced = GenotypeStore::from_file_with_options( + &path, + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Vcf), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + assert_eq!(forced.backend_name(), "vcf"); + assert_eq!(forced.get("rs1").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn vcf_file_lookup_handles_gt_field_order_no_calls_and_bad_positions() { + let dir = temp_dir("vcf-field-order"); + let path = dir.join("sample.vcf"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=GRCh38\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\trs1\tA\tG\t.\tPASS\t.\tDP:GT\t14:0|1\n\ + 1\t11\trs2\tC\tT\t.\tPASS\t.\tGT:DP\t./.:9\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + 
assert_eq!(store.get("rs1").unwrap().as_deref(), Some("AG")); + assert_eq!(store.get("rs2").unwrap().as_deref(), Some("--")); + + let bad_path = dir.join("bad.vcf"); + fs::write( + &bad_path, + "##fileformat=VCFv4.2\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\tnot-a-pos\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + let err = GenotypeStore::from_file(&bad_path) + .unwrap() + .get("rs1") + .unwrap_err(); + assert!( + format!("{err:?}").contains("failed to parse VCF position 'not-a-pos'"), + "{err:?}" + ); +} + #[test] fn genotype_store_from_bytes_handles_zip() { let bytes = zip_bytes( @@ -228,6 +371,52 @@ fn vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { ); } +#[test] +fn batch_lookup_preserves_input_order_after_coordinate_sorting() { + let dir = temp_dir("batch-order"); + let path = dir.join("sample.txt"); + fs::write( + &path, + "rsid\tchromosome\tposition\tgenotype\n\ + rs2\t1\t20\tCT\n\ + rs1\t1\t10\tAG\n", + ) + .unwrap(); + let store = GenotypeStore::from_file(&path).unwrap(); + + let results = store + .lookup_variants(&[ + VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 20, + end: 20, + }), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + ..VariantSpec::default() + }, + ]) + .unwrap(); + + assert_eq!(results[0].genotype.as_deref(), Some("CT")); + assert_eq!( + results[0].evidence, + vec!["resolved by locus 1:20".to_owned()] + ); + assert_eq!(results[1].genotype.as_deref(), Some("AG")); + assert_eq!( + results[1].evidence, + vec!["resolved by locus 1:10".to_owned()] + ); +} + #[test] fn zip_genotype_file_is_auto_detected_and_readable() { let dir = temp_dir("zip-auto"); diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index 635dbc9..402a5ce 100644 --- 
a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -5,8 +5,9 @@ use std::{ }; use bioscript_schema::{ - RemoteResourceKind, load_variant_manifest_text, load_variant_manifest_text_for_lookup, - resolve_remote_resource_text, validate_panels_path, validate_variants_path, + RemoteResourceKind, load_panel_manifest, load_variant_manifest_text, + load_variant_manifest_text_for_lookup, resolve_remote_resource_text, validate_panels_path, + validate_variants_path, }; fn temp_dir(label: &str) -> PathBuf { @@ -105,6 +106,47 @@ provenance: assert_eq!(report.total_warnings(), 0); } +#[test] +fn validate_variants_scans_nested_yaml_files_and_ignores_other_files() { + let dir = temp_dir("validate-dir"); + let nested = dir.join("nested"); + fs::create_dir_all(&nested).unwrap(); + fs::write(dir.join("notes.txt"), "not yaml").unwrap(); + fs::write( + dir.join("valid.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "rs1" +identifiers: + rsids: ["rs1"] +alleles: + kind: "snv" + ref: "A" + alts: ["G"] +"#, + ) + .unwrap(); + fs::write( + nested.join("missing-schema.yml"), + r#" +version: "1.0" +name: "rs2" +"#, + ) + .unwrap(); + + let report = validate_variants_path(&dir).unwrap(); + let text = report.render_text(); + + assert_eq!(report.files_scanned, 2); + assert!(report.has_errors()); + assert_eq!(report.total_errors(), 1); + assert!(text.contains("missing-schema.yml")); + assert!(text.contains("missing schema")); + assert!(!text.contains("notes.txt")); +} + #[test] fn load_variant_manifest_text_accepts_start_end_coordinates() { let manifest = load_variant_manifest_text( @@ -136,6 +178,45 @@ alleles: assert_eq!(grch38.end, 45_679_786); } +#[test] +fn load_panel_manifest_parses_downloads_permissions_and_member_metadata() { + let dir = temp_dir("load-panel"); + let fixture = dir.join("panel.yaml"); + fs::write( + &fixture, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "traits-common" +tags: 
["type:trait"] +permissions: + domains: + - "https://example.org" +downloads: + - id: "remote-rs1" + url: "https://example.org/variants/rs1.yaml" + sha256: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + version: "2026-01-01" +members: + - kind: "variant" + download: "remote-rs1" + sha256: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + version: "2026-01-01" +"#, + ) + .unwrap(); + + let panel = load_panel_manifest(&fixture).unwrap(); + + assert_eq!(panel.name, "traits-common"); + assert_eq!(panel.tags, vec!["type:trait"]); + assert_eq!(panel.permissions.domains, vec!["https://example.org"]); + assert_eq!(panel.downloads.len(), 1); + assert_eq!(panel.downloads[0].origin, "https://example.org"); + assert_eq!(panel.members.len(), 1); + assert_eq!(panel.members[0].download.as_deref(), Some("remote-rs1")); +} + #[test] fn lookup_compile_allows_non_execution_metadata_issues() { let text = r#" @@ -224,6 +305,59 @@ members: assert_eq!(report.total_warnings(), 0); } +#[test] +fn validate_panels_reports_member_and_download_shape_issues() { + let dir = temp_dir("validate-panel-shape"); + let fixture = dir.join("panel.yaml"); + fs::write( + &fixture, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "traits-common" +permissions: + domains: + - "https://example.org/path" + - "ftp://example.org" + - "https://example.org" + - "https://example.org" +downloads: + - id: "remote-rs1" + url: "https://example.org/variants/rs1.yaml" + sha256: "not-a-sha" + version: "1.0" + - id: "remote-rs1" + url: "file:///tmp/rs1.yaml" + sha256: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + version: "" +members: + - kind: "script" + path: "variants/rs1.yaml" + download: "remote-rs1" + sha256: "not-a-sha" + version: "" + - kind: "variant" + download: "missing-download" + - kind: "variant" + path: "" +"#, + ) + .unwrap(); + + let report = validate_panels_path(&fixture).unwrap(); + let text = report.render_text(); + + 
assert!(report.total_errors() >= 11, "{text}"); + assert!(report.total_warnings() >= 1, "{text}"); + assert!(text.contains("expected origin only")); + assert!(text.contains("expected http or https origin")); + assert!(text.contains("duplicate origin")); + assert!(text.contains("duplicate download id")); + assert!(text.contains("unknown download id")); + assert!(text.contains("unsupported member kind")); + assert!(text.contains("expected exactly one of path or download")); +} + #[test] fn remote_resource_resolution_detects_panel_members() { let text = r#" From 4e9b92764e270a6445c29a4c90c519d5c1b76c23 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 14:46:22 +1000 Subject: [PATCH 4/9] more tests --- rust/bioscript-cli/tests/cli.rs | 395 +++++++++++++++++- rust/bioscript-formats/tests/file_formats.rs | 248 +++++++++++ rust/bioscript-formats/tests/prepare.rs | 30 ++ rust/bioscript-runtime/tests/security.rs | 105 +++++ .../tests/validate_variants.rs | 238 +++++++++++ rust/bioscript-wasm/src/js_reader.rs | 11 +- 6 files changed, 1019 insertions(+), 8 deletions(-) diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index 4e8daf0..e855032 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -1,7 +1,8 @@ use std::{ + ffi::OsStr, fs, path::PathBuf, - process::Command, + process::{Command, Output}, time::{SystemTime, UNIX_EPOCH}, }; @@ -27,6 +28,153 @@ fn temp_dir(label: &str) -> PathBuf { dir } +fn run_bioscript(root: &PathBuf, args: I) -> Output +where + I: IntoIterator, + S: AsRef, +{ + Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(root) + .args(args) + .output() + .unwrap() +} + +fn stderr_text(output: &Output) -> String { + String::from_utf8_lossy(&output.stderr).into_owned() +} + +#[test] +fn cli_reports_usage_when_no_script_or_subcommand_is_provided() { + let root = repo_root(); + + let output = run_bioscript(&root, std::iter::empty::<&str>()); + + 
assert!(!output.status.success()); + let stderr = stderr_text(&output); + assert!(stderr.contains("usage: bioscript"), "{stderr}"); + assert!(stderr.contains("validate-variants"), "{stderr}"); + assert!(stderr.contains("inspect "), "{stderr}"); +} + +#[test] +fn cli_rejects_missing_values_and_unexpected_arguments() { + let root = repo_root(); + + for (args, expected) in [ + (vec!["--root"], "--root requires a directory"), + (vec!["--input-file"], "--input-file requires a path"), + (vec!["--output-file"], "--output-file requires a path"), + ( + vec!["--participant-id"], + "--participant-id requires a value", + ), + (vec!["--trace-report"], "--trace-report requires a path"), + (vec!["--timing-report"], "--timing-report requires a path"), + (vec!["--filter"], "--filter requires key=value"), + (vec!["--input-index"], "--input-index requires a path"), + (vec!["--reference-file"], "--reference-file requires a path"), + ( + vec!["--reference-index"], + "--reference-index requires a path", + ), + (vec!["--cache-dir"], "--cache-dir requires a path"), + ( + vec!["bioscripts/hello-world.py", "extra"], + "unexpected argument: extra", + ), + (vec!["inspect"], "usage: bioscript inspect"), + ( + vec!["inspect", "bioscripts/hello-world.py", "extra"], + "unexpected argument: extra", + ), + (vec!["prepare", "--root"], "--root requires a directory"), + ( + vec!["prepare", "--cache-dir"], + "--cache-dir requires a path", + ), + ( + vec!["validate-variants", "--report"], + "--report requires a path", + ), + ( + vec!["validate-panels", "--report"], + "--report requires a path", + ), + ] { + let output = run_bioscript(&root, args); + assert!(!output.status.success(), "expected failure for {expected}"); + let stderr = stderr_text(&output); + assert!(stderr.contains(expected), "{stderr}"); + } +} + +#[test] +fn cli_rejects_invalid_numeric_limits_and_input_formats() { + let root = repo_root(); + + for (args, expected) in [ + ( + vec!["--input-format", "bam", "bioscripts/hello-world.py"], + 
"invalid --input-format value bam", + ), + ( + vec!["--max-duration-ms", "soon", "bioscripts/hello-world.py"], + "invalid --max-duration-ms value soon", + ), + ( + vec!["--max-memory-bytes", "large", "bioscripts/hello-world.py"], + "invalid --max-memory-bytes value large", + ), + ( + vec!["--max-allocations", "many", "bioscripts/hello-world.py"], + "invalid --max-allocations value many", + ), + ( + vec!["--max-recursion-depth", "deep", "bioscripts/hello-world.py"], + "invalid --max-recursion-depth value deep", + ), + ( + vec!["prepare", "--input-format", "bam"], + "invalid --input-format: unsupported input format: bam", + ), + ] { + let output = run_bioscript(&root, args); + assert!(!output.status.success(), "expected failure for {expected}"); + let stderr = stderr_text(&output); + assert!(stderr.contains(expected), "{stderr}"); + } +} + +#[test] +fn cli_rejects_unsupported_manifest_schema() { + let root = repo_root(); + let dir = temp_dir("unsupported-manifest"); + let manifest = dir.join("unsupported.yaml"); + fs::write( + &manifest, + r#" +schema: "bioscript:catalogue:1.0" +version: "1.0" +name: "catalogue" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg(&manifest) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = stderr_text(&output); + assert!( + stderr.contains("unsupported manifest schema 'bioscript:catalogue:1.0'"), + "{stderr}" + ); +} + #[test] fn hello_world_script_runs_via_cli_and_writes_within_root() { let root = repo_root(); @@ -123,6 +271,63 @@ fn timing_report_is_written_for_hello_world() { assert!(timing.contains("script=bioscripts/hello-world.py")); } +#[test] +fn auto_index_adds_reference_index_timing_for_script_runs() { + let root = repo_root(); + let dir = temp_dir("auto-index-script"); + let cache_dir = dir.join("cache"); + let timing_path = dir.join("reports/timing.tsv"); + fs::write(dir.join("ref.fa"), b">chr1\nACGT\n").unwrap(); + fs::write( + 
dir.join("script.py"), + r#" +def main(): + print("indexed") + + +if __name__ == "__main__": + main() +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--root") + .arg(&dir) + .arg("--reference-file") + .arg("ref.fa") + .arg("--auto-index") + .arg("--cache-dir") + .arg(&cache_dir) + .arg("--timing-report") + .arg(&timing_path) + .arg(dir.join("script.py")) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("bioscript: auto-indexed reference ->"), + "{stderr}" + ); + assert!(fs::read_dir(&cache_dir).unwrap().any(|entry| { + entry + .unwrap() + .path() + .extension() + .is_some_and(|ext| ext == "fai") + })); + let timing = fs::read_to_string(timing_path).unwrap(); + assert!(timing.contains("auto_index\t"), "{timing}"); + assert!(timing.contains("run_file_total\t"), "{timing}"); +} + #[test] fn batch_lookup_query_plan_runs_and_preserves_requested_result_order() { let root = repo_root(); @@ -360,6 +565,65 @@ alleles: assert!(stdout.contains("AG")); } +#[test] +fn variant_manifest_writes_output_trace_and_participant_id() { + let root = repo_root(); + let dir = temp_dir("variant-manifest-output"); + let manifest = dir.join("rs1.yaml"); + let output_path = dir.join("reports/variant.tsv"); + let trace_path = dir.join("reports/variant.trace.tsv"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--output-file") + .arg(&output_path) + 
.arg("--participant-id") + .arg("participant-1") + .arg("--trace-report") + .arg(&trace_path) + .arg(&manifest) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + assert!(String::from_utf8_lossy(&output.stdout).is_empty()); + let table = fs::read_to_string(output_path).unwrap(); + assert!(table.contains("participant-1"), "{table}"); + assert!(table.contains("example-rs73885319"), "{table}"); + let trace = fs::read_to_string(trace_path).unwrap(); + assert!(trace.contains("step\tline\tcode"), "{trace}"); + assert!(trace.contains("rs1.yaml"), "{trace}"); +} + #[test] fn panel_manifest_runs_directly_via_cli() { let root = repo_root(); @@ -451,3 +715,132 @@ members: assert!(stdout.contains("example-rs73885319")); assert!(!stdout.contains("example-rs60910145")); } + +#[test] +fn panel_manifest_filters_by_kind_tag_path_and_rejects_unknown_filter_keys() { + let root = repo_root(); + let dir = temp_dir("panel-filters"); + let variants_dir = dir.join("variants"); + fs::create_dir_all(&variants_dir).unwrap(); + fs::write( + variants_dir.join("rs73885319.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "example-panel" +members: + - kind: "variant" + path: "variants/rs73885319.yaml" + version: "1.0" +"#, + ) + .unwrap(); + + let matched = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--filter") + .arg("kind=variant") + .arg("--filter") + .arg("tag=type:trait") + .arg("--filter") + .arg("path=rs73885319") + .arg(&panel) + .output() + .unwrap(); + + assert!( + 
matched.status.success(), + "stderr: {}", + String::from_utf8_lossy(&matched.stderr) + ); + let stdout = String::from_utf8_lossy(&matched.stdout); + assert!(stdout.contains("example-rs73885319"), "{stdout}"); + + let filtered_out = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--filter") + .arg("unknown=value") + .arg(&panel) + .output() + .unwrap(); + + assert!( + filtered_out.status.success(), + "stderr: {}", + String::from_utf8_lossy(&filtered_out.stderr) + ); + let stdout = String::from_utf8_lossy(&filtered_out.stdout); + assert!(stdout.starts_with("kind\tname\tpath"), "{stdout}"); + assert!(!stdout.contains("example-rs73885319"), "{stdout}"); +} + +#[test] +fn panel_manifest_reports_remote_members_as_not_executable_yet() { + let root = repo_root(); + let dir = temp_dir("panel-remote-member"); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "remote-panel" +permissions: + domains: + - "https://example.com" +downloads: + - id: "remote-rs73885319" + url: "https://example.com/rs73885319.yaml" + sha256: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + version: "1.0" +members: + - kind: "variant" + download: "remote-rs73885319" + version: "1.0" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&panel) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("remote panel members are not executable yet"), + "{stderr}" + ); +} diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index b38bc38..546beca 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ 
-298,6 +298,59 @@ fn alignment_index_parsers_handle_in_memory_bytes() { assert!(format!("{err:?}").contains("failed to parse tabix index bytes")); } +#[test] +fn alignment_reader_api_reports_invalid_cram_headers_without_real_fixtures() { + let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); + let repository = alignment::build_reference_repository_from_readers( + std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), + fai, + ); + let locus = bioscript_core::GenomicLocus { + chrom: "chr1".to_owned(), + start: 1, + end: 1, + }; + let crai_bytes = fs::read(mini_fixtures_dir().join("mini.cram.crai")).unwrap(); + let mut reader = alignment::build_cram_indexed_reader_from_reader( + std::io::Cursor::new(b"not a cram".to_vec()), + alignment::parse_crai_bytes(&crai_bytes).unwrap(), + repository, + ) + .unwrap(); + + let err = + alignment::for_each_cram_record_with_reader(&mut reader, "bad.cram", &locus, |_| Ok(true)) + .unwrap_err(); + assert!( + format!("{err:?}").contains("failed to read CRAM header bad.cram"), + "{err:?}" + ); + + let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); + let repository = alignment::build_reference_repository_from_readers( + std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), + fai, + ); + let mut raw_reader = alignment::build_cram_indexed_reader_from_reader( + std::io::Cursor::new(b"still not a cram".to_vec()), + alignment::parse_crai_bytes(&crai_bytes).unwrap(), + repository, + ) + .unwrap(); + + let err = alignment::for_each_raw_cram_record_with_reader( + &mut raw_reader, + "raw-bad.cram", + &locus, + |_| Ok(true), + ) + .unwrap_err(); + assert!( + format!("{err:?}").contains("failed to read CRAM header raw-bad.cram"), + "{err:?}" + ); +} + #[test] fn delimited_parser_handles_comments_blank_lines_csv_and_split_alleles() { let dir = temp_dir("csv-split-alleles"); @@ -334,6 +387,39 @@ fn delimited_parser_handles_comments_blank_lines_csv_and_split_alleles() { ); } 
+#[test] +fn delimited_parser_handles_space_delimited_rows_without_headers_and_inline_comments() { + let dir = temp_dir("space-default-header"); + let path = dir.join("sample.txt"); + fs::write( + &path, + "\n\ + rsSpace chr2 200 tc # inline comment\n\ + chrOnly chr2 201 aa\n\ + badrow\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + + assert_eq!(store.get("rsSpace").unwrap().as_deref(), Some("TC")); + let observation = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "2".to_owned(), + start: 201, + end: 201, + }), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(observation.genotype.as_deref(), Some("AA")); + assert_eq!( + observation.evidence, + vec!["resolved by locus chr2:201".to_owned()] + ); +} + #[test] fn vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { let dir = temp_dir("vcf-chr-normalize"); @@ -371,6 +457,168 @@ fn vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { ); } +#[test] +fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { + let dir = temp_dir("vcf-indel-locus"); + let path = dir.join("sample.hg19.vcf"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=hg19\n\ + ##FORMAT=\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t99\t.\tAT\tA\t.\tPASS\t.\tGT\t0/1\n\ + chr1\t199\t.\tA\tATG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + let deletion = store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "chr1".to_owned(), + start: 100, + end: 100, + }), + reference: Some("AT".to_owned()), + alternate: Some("A".to_owned()), + kind: Some(VariantKind::Deletion), + deletion_length: Some(1), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(deletion.genotype.as_deref(), Some("ID")); + assert_eq!(deletion.assembly, Some(bioscript_core::Assembly::Grch37)); + 
assert_eq!(deletion.evidence, vec!["resolved by locus 1:99".to_owned()]); + + let insertion = store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 200, + end: 200, + }), + reference: Some("A".to_owned()), + alternate: Some("ATG".to_owned()), + kind: Some(VariantKind::Insertion), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(insertion.genotype.as_deref(), Some("DI")); + assert_eq!( + insertion.evidence, + vec!["resolved by locus chr1:199".to_owned()] + ); + + let unresolved = store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 300, + end: 300, + }), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(unresolved.genotype, None); + assert_eq!( + unresolved.evidence, + vec!["no matching rsid or locus found for variant_by_locus".to_owned()] + ); +} + +#[test] +fn forced_cram_backend_reports_early_argument_errors_without_reading_cram() { + let dir = temp_dir("cram-early-errors"); + let cram_path = dir.join("missing.cram"); + let reference = dir.join("GRCh38.fa"); + let store_without_reference = GenotypeStore::from_file_with_options( + &cram_path, + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Cram), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + + let err = store_without_reference + .lookup_variant(&VariantSpec { + rsids: vec!["rs1".to_owned()], + ..VariantSpec::default() + }) + .unwrap_err(); + assert!( + format!("{err:?}").contains("without --reference-file"), + "{err:?}" + ); + + let store = GenotypeStore::from_file_with_options( + &cram_path, + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Cram), + reference_file: Some(reference), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + + let err = store + .lookup_variant(&VariantSpec { + rsids: vec!["rs1".to_owned()], + ..VariantSpec::default() + }) + .unwrap_err(); + 
assert!(format!("{err:?}").contains("needs GRCh37/GRCh38 coordinates")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + kind: Some(VariantKind::Snp), + alternate: Some("G".to_owned()), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("SNP variant requires ref/reference")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!( + format!("{err:?}").contains("failed to open indexed FASTA"), + "{err:?}" + ); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Other), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("does not yet support Other")); +} + #[test] fn batch_lookup_preserves_input_order_after_coordinate_sorting() { let dir = temp_dir("batch-order"); diff --git a/rust/bioscript-formats/tests/prepare.rs b/rust/bioscript-formats/tests/prepare.rs index 63d86c0..d533b41 100644 --- a/rust/bioscript-formats/tests/prepare.rs +++ b/rust/bioscript-formats/tests/prepare.rs @@ -185,6 +185,21 @@ fn bam_without_adjacent_index_returns_clear_error() { ); } +#[test] +fn cram_without_adjacent_index_reports_build_failure() { + let root = temp_dir("invalid-cram-root"); + let cwd = temp_dir("invalid-cram-cwd"); + fs::write(root.join("sample.cram"), b"not a real cram").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some("sample.cram".to_owned()); + + let err = prepare_indexes(&req).unwrap_err(); + + 
assert!(err.contains("failed to build alignment index"), "{err}"); + assert!(err.contains("sample.cram"), "{err}"); +} + #[test] fn adjacent_fasta_index_is_detected() { let root = temp_dir("adjacent-fasta-root"); @@ -211,6 +226,21 @@ fn adjacent_fasta_index_is_detected() { ); } +#[test] +fn invalid_fasta_reference_reports_index_build_failure() { + let root = temp_dir("invalid-fasta-root"); + let cwd = temp_dir("invalid-fasta-cwd"); + fs::write(root.join("ref.fa"), b"not fasta\n").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.reference_file = Some("ref.fa".to_owned()); + + let err = prepare_indexes(&req).unwrap_err(); + + assert!(err.contains("failed to build FASTA index"), "{err}"); + assert!(err.contains("ref.fa"), "{err}"); +} + #[test] fn fasta_index_is_generated_in_cache_when_missing() { let root = temp_dir("generated-fasta-root"); diff --git a/rust/bioscript-runtime/tests/security.rs b/rust/bioscript-runtime/tests/security.rs index 312d683..0cabe59 100644 --- a/rust/bioscript-runtime/tests/security.rs +++ b/rust/bioscript-runtime/tests/security.rs @@ -172,3 +172,108 @@ if __name__ == "__main__": assert!(details.contains("genotype=None"), "{details}"); assert!(details.contains("genotype='--'"), "{details}"); } + +#[test] +fn runtime_trace_report_records_rsid_and_coordinate_lookup_metadata() { + let dir = temp_dir("trace-metadata"); + let script = dir.join("script.py"); + let trace = dir.join("reports/trace.tsv"); + fs::write( + &script, + r#" +RSID = bioscript.variant(rsid="rs73885319") +COORD = bioscript.variant(grch38="chr22:36265860-36265860", ref="A", alt="G", kind="snp") + +def main(): + text = str(RSID) + str(COORD) + +if __name__ == "__main__": + main() +"#, + ) + .unwrap(); + + let runtime = BioscriptRuntime::new(&dir).unwrap(); + runtime.run_file(&script, Some(&trace), Vec::new()).unwrap(); + + let report = fs::read_to_string(trace).unwrap(); + assert!(report.contains("lookup_key\tlookup_url"), "{report}"); + 
assert!(report.contains("rs73885319"), "{report}"); + assert!( + report.contains("https://www.ncbi.nlm.nih.gov/snp/rs73885319"), + "{report}" + ); + assert!(report.contains("22:36265860-36265860"), "{report}"); + assert!( + report.contains("https://www.ensembl.org/Homo_sapiens/Location/View"), + "{report}" + ); +} + +#[test] +fn runtime_write_tsv_serializes_rows_and_records_timing() { + let dir = temp_dir("write-tsv"); + let runtime = run_script_with_inputs( + &dir, + r#" +def main(): + bioscript.write_tsv("outputs/table.tsv", [ + {"name": "alpha", "count": 2, "ok": True, "empty": None}, + {"name": "beta", "count": 3, "ok": False}, + ]) + +if __name__ == "__main__": + main() +"#, + Vec::new(), + ) + .unwrap(); + + let table = fs::read_to_string(dir.join("outputs/table.tsv")).unwrap(); + assert!(table.contains("count\tempty\tname\tok"), "{table}"); + assert!(table.contains("2\t\talpha\ttrue"), "{table}"); + assert!(table.contains("3\t\tbeta\tfalse"), "{table}"); + assert!( + runtime + .timing_snapshot() + .iter() + .any(|timing| timing.stage == "write_tsv") + ); +} + +#[test] +fn runtime_reports_host_method_argument_errors() { + for (code, expected) in [ + ( + "bioscript.variant(rsid=123)\n", + "expected string or list of strings", + ), + ( + "bioscript.variant(foo='bar')\n", + "bioscript.variant does not accept keyword 'foo'", + ), + ( + "bioscript.variant(grch38='not-a-locus')\n", + "invalid locus string", + ), + ( + "bioscript.variant(kind='structural')\n", + "invalid variant kind", + ), + ( + "bioscript.query_plan('not a plan')\n", + "expected a list of Variant objects or a VariantPlan", + ), + ( + "bioscript.write_tsv('outputs/table.tsv', 'not rows')\n", + "write_tsv expects a list of dict rows", + ), + ( + "bioscript.read_text(path='inputs/source.txt')\n", + "read_text does not accept keyword arguments", + ), + ] { + let err = run_script(code).unwrap_err(); + assert!(err.contains(expected), "{err}"); + } +} diff --git 
a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index 402a5ce..f6530f0 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -358,6 +358,203 @@ members: assert!(text.contains("expected exactly one of path or download")); } +#[test] +fn validate_variants_reports_type_and_metadata_issues() { + let dir = temp_dir("validate-variant-edges"); + fs::write( + dir.join("typed-shape.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "2.0" +name: "" +label: 42 +gene: "" +summary: "" +tags: "type:trait" +identifiers: + rsids: "rs1" +coordinates: + grch37: + chrom: "0" + pos: "one" +alleles: + kind: "other" + canonical_alt: "G" + ref: "" + alts: + - 1 + - "" +findings: + - "not-a-map" + - schema: "bioscript:trait:1.0" + alt: "G" +provenance: + sources: + - "not-a-map" + - kind: "" + label: "" + url: "mailto:example" +"#, + ) + .unwrap(); + + let report = validate_variants_path(&dir).unwrap(); + let text = report.render_text(); + + assert_eq!(report.files_scanned, 1); + assert!(report.total_errors() >= 17, "{text}"); + assert!(report.total_warnings() >= 4, "{text}"); + for expected in [ + "expected '1.0'", + "expected string", + "expected a sequence of strings", + "expected integer", + "canonical_alt is not part of the current schema", + "expected one of snv, deletion, insertion, indel", + "finding alt 'G' is not present", + "expected http or https URL", + ] { + assert!(text.contains(expected), "{expected}\n{text}"); + } +} + +#[test] +fn validate_variants_reports_coordinate_edge_cases() { + let dir = temp_dir("validate-variant-coordinate-edges"); + fs::write( + dir.join("coordinate-range.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "range" +identifiers: + rsids: + - "rs1" + - "rs1" +coordinates: + grch38: + chrom: "MT" + start: 20 + end: 10 +alleles: + kind: "snv" + ref: "A" + alts: + - "N" +"#, + ) + .unwrap(); + fs::write( + 
dir.join("single-position-range.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "single-position-range" +coordinates: + grch38: + chrom: "X" + start: 5 + end: 5 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + fs::write( + dir.join("pos-and-range.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "both-coordinate-styles" +coordinates: + grch38: + chrom: "Y" + pos: 5 + start: 5 + end: 6 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + + let report = validate_variants_path(&dir).unwrap(); + let text = report.render_text(); + + assert_eq!(report.files_scanned, 3); + assert!(report.total_errors() >= 3, "{text}"); + assert!(report.total_warnings() >= 2, "{text}"); + for expected in [ + "duplicate identifier 'rs1'", + "expected end >= start", + "single-position coordinate uses start/end", + "use either pos or start/end", + ] { + assert!(text.contains(expected), "{expected}\n{text}"); + } +} + +#[test] +fn validate_panels_reports_missing_empty_and_type_issues() { + let dir = temp_dir("validate-panel-edges"); + fs::write( + dir.join("missing-members.yaml"), + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "missing-members" +label: 7 +summary: "" +tags: "type:trait" +permissions: + domains: "https://example.org" +downloads: + - "not-a-map" +"#, + ) + .unwrap(); + fs::write( + dir.join("empty-members.yaml"), + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "empty-members" +permissions: + domains: + - 3 + - "https://" + - "https://example.org:8443" +members: [] +"#, + ) + .unwrap(); + + let report = validate_panels_path(&dir).unwrap(); + let text = report.render_text(); + + assert_eq!(report.files_scanned, 2); + assert!(report.total_errors() >= 7, "{text}"); + assert!(report.total_warnings() >= 1, "{text}"); + for expected in [ + "expected a sequence of strings", + "downloads[0]: expected mapping", + "members: missing required field", + "members: expected at 
least one member", + "permissions.domains[0]: expected string", + "invalid URL", + "expected string", + "empty string", + ] { + assert!(text.contains(expected), "{expected}\n{text}"); + } +} + #[test] fn remote_resource_resolution_detects_panel_members() { let text = r#" @@ -542,3 +739,44 @@ fn remote_resource_resolution_reports_invalid_structured_text() { "{err}" ); } + +#[test] +fn remote_resource_resolution_handles_json_versions_and_plain_relative_urls() { + let resolved = resolve_remote_resource_text( + "https://example.com/catalogues/index.json", + "assay.json", + r#" +{ + "name": "json-assay", + "assay": { + "version": "2026.4", + "panel": "panels/common.yaml" + }, + "artifact_url": "../artifacts/compiled.json" +} +"#, + ) + .unwrap(); + + assert_eq!(resolved.kind, RemoteResourceKind::Assay); + assert_eq!(resolved.version.as_deref(), Some("2026.4")); + let urls = resolved + .dependencies + .iter() + .map(|dependency| dependency.url.as_str()) + .collect::>(); + assert_eq!( + urls, + vec![ + "https://example.com/artifacts/compiled.json", + "https://example.com/catalogues/panels/common.yaml", + ] + ); + + let err = + resolve_remote_resource_text("https://example.com/bad.json", "bad.json", "{").unwrap_err(); + assert!( + err.contains("failed to parse JSON resource bad.json"), + "{err}" + ); +} diff --git a/rust/bioscript-wasm/src/js_reader.rs b/rust/bioscript-wasm/src/js_reader.rs index e28af35..7dc49c4 100644 --- a/rust/bioscript-wasm/src/js_reader.rs +++ b/rust/bioscript-wasm/src/js_reader.rs @@ -44,13 +44,10 @@ impl Read for JsReader { &JsValue::from_f64(want as f64), ) .map_err(|err| { - io::Error::new( - io::ErrorKind::Other, - format!( - "{} readAt({}, {}) threw: {:?}", - self.label, self.position, want, err - ), - ) + io::Error::other(format!( + "{} readAt({}, {}) threw: {:?}", + self.label, self.position, want, err + )) })?; let array = Uint8Array::from(result); let got = array.byte_length() as usize; From 600af27614c098d46cca65ad935fe436aa2af742 Mon Sep 
17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 15:05:13 +1000 Subject: [PATCH 5/9] more tests --- rust/bioscript-formats/tests/file_formats.rs | 230 ++++++++++- rust/bioscript-formats/tests/inspect.rs | 155 ++++++++ rust/bioscript-runtime/tests/security.rs | 388 +++++++++++++++++++ 3 files changed, 759 insertions(+), 14 deletions(-) diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index 546beca..05d8702 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -179,6 +179,32 @@ fn genotype_store_from_bytes_handles_vcf() { assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); } +#[test] +fn vcf_bytes_skip_unusable_rows_and_decode_no_call_forms() { + let store = GenotypeStore::from_bytes( + "sample.vcf", + b"##fileformat=VCFv4.2\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\t.\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ + 1\t11\trsEmptyRef\t.\tG\t.\tPASS\t.\tGT\t0/1\n\ + 1\t12\trsEmptyAlt\tA\t.\t.\tPASS\t.\tGT\t0/1\n\ + 1\t13\trsShort\tA\tG\n\ + 1\t14\trsNoCall\tA\tG\t.\tPASS\t.\tGT\t.\n\ + 1\t15\trsPartialNoCall\tA\tG\t.\tPASS\t.\tGT\t./1\n\ + 1\t16\trsOutOfRange\tA\tG\t.\tPASS\t.\tGT\t0/2\n\ + 1\t17\trsValid\tC\tT\t.\tPASS\t.\tGT\t1|1\n", + ) + .unwrap(); + + assert_eq!(store.backend_name(), "vcf"); + assert_eq!(store.get("rsValid").unwrap().as_deref(), Some("TT")); + assert_eq!(store.get("rsNoCall").unwrap().as_deref(), Some("--")); + assert_eq!(store.get("rsPartialNoCall").unwrap().as_deref(), Some("--")); + assert_eq!(store.get("rsOutOfRange").unwrap(), None); + assert_eq!(store.get("rsEmptyRef").unwrap().as_deref(), Some(".G")); + assert_eq!(store.get("rsEmptyAlt").unwrap(), None); +} + #[test] fn extensionless_vcf_is_detected_by_content_and_can_be_forced() { let dir = temp_dir("extensionless-vcf"); @@ -257,6 +283,40 @@ fn genotype_store_from_bytes_handles_zip() { 
assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); } +#[test] +fn rsid_map_batch_lookup_preserves_order_and_reports_missing_rsids() { + let store = GenotypeStore::from_bytes( + "sample.txt", + b"rsid\tchromosome\tposition\tgenotype\nrs2\t1\t20\tCT\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + + let results = store + .lookup_variants(&[ + VariantSpec { + rsids: vec!["rs2".to_owned()], + ..VariantSpec::default() + }, + VariantSpec { + rsids: vec!["rsMissing".to_owned()], + ..VariantSpec::default() + }, + VariantSpec { + rsids: vec!["rs1".to_owned()], + ..VariantSpec::default() + }, + ]) + .unwrap(); + + assert_eq!(results[0].genotype.as_deref(), Some("CT")); + assert_eq!(results[1].genotype, None); + assert_eq!( + results[1].evidence, + vec!["no matching rsid found".to_owned()] + ); + assert_eq!(results[2].genotype.as_deref(), Some("AG")); +} + #[test] fn genotype_store_from_bytes_rejects_malformed_zip() { let err = GenotypeStore::from_bytes("sample.zip", b"not a zip").unwrap_err(); @@ -387,6 +447,39 @@ fn delimited_parser_handles_comments_blank_lines_csv_and_split_alleles() { ); } +#[test] +fn delimited_parser_uses_comment_headers_aliases_quotes_and_extra_columns() { + let dir = temp_dir("comment-header-aliases"); + let path = dir.join("sample.csv"); + fs::write( + &path, + "# SNP ID, Chrom, Base Pair Position, Result, Ignored\n\ + \"rsQuoted\", \"chr3\", \"300\", \"a t\", \"unused, value\"\n\ + rsSlash,3,301,A/-,\n\ + rsNone,3,302,None,\n\ + no_position,3,,AG,\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + + assert_eq!(store.get("rsQuoted").unwrap().as_deref(), Some("AT")); + assert_eq!(store.get("rsSlash").unwrap().as_deref(), Some("ID")); + assert_eq!(store.get("rsNone").unwrap().as_deref(), Some("--")); + assert_eq!(store.get("no_position").unwrap().as_deref(), Some("AG")); + let observation = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "3".to_owned(), + start: 300, + 
end: 300, + }), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(observation.genotype.as_deref(), Some("AT")); +} + #[test] fn delimited_parser_handles_space_delimited_rows_without_headers_and_inline_comments() { let dir = temp_dir("space-default-header"); @@ -528,11 +621,23 @@ fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { ); } +fn forced_cram_store(dir: &std::path::Path, reference_name: &str) -> GenotypeStore { + GenotypeStore::from_file_with_options( + &dir.join("missing.cram"), + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Cram), + reference_file: Some(dir.join(reference_name)), + reference_index: Some(dir.join(format!("{reference_name}.fai"))), + input_index: Some(dir.join("missing.cram.crai")), + }, + ) + .unwrap() +} + #[test] -fn forced_cram_backend_reports_early_argument_errors_without_reading_cram() { - let dir = temp_dir("cram-early-errors"); +fn forced_cram_backend_reports_reference_and_coordinate_errors_without_reading_cram() { + let dir = temp_dir("cram-reference-errors"); let cram_path = dir.join("missing.cram"); - let reference = dir.join("GRCh38.fa"); let store_without_reference = GenotypeStore::from_file_with_options( &cram_path, &GenotypeLoadOptions { @@ -541,7 +646,6 @@ fn forced_cram_backend_reports_early_argument_errors_without_reading_cram() { }, ) .unwrap(); - let err = store_without_reference .lookup_variant(&VariantSpec { rsids: vec!["rs1".to_owned()], @@ -553,16 +657,7 @@ fn forced_cram_backend_reports_early_argument_errors_without_reading_cram() { "{err:?}" ); - let store = GenotypeStore::from_file_with_options( - &cram_path, - &GenotypeLoadOptions { - format: Some(GenotypeSourceFormat::Cram), - reference_file: Some(reference), - ..GenotypeLoadOptions::default() - }, - ) - .unwrap(); - + let store = forced_cram_store(&dir, "GRCh38.fa"); let err = store .lookup_variant(&VariantSpec { rsids: vec!["rs1".to_owned()], @@ -570,7 +665,14 @@ fn 
forced_cram_backend_reports_early_argument_errors_without_reading_cram() { }) .unwrap_err(); assert!(format!("{err:?}").contains("needs GRCh37/GRCh38 coordinates")); + assert!(format!("{err:?}").contains("reference index")); + assert!(format!("{err:?}").contains("input index")); +} +#[test] +fn forced_cram_backend_reports_snp_and_indel_argument_errors_without_reading_cram() { + let dir = temp_dir("cram-variant-argument-errors"); + let store = forced_cram_store(&dir, "GRCh38.fa"); let err = store .lookup_variant(&VariantSpec { grch38: Some(bioscript_core::GenomicLocus { @@ -585,6 +687,65 @@ fn forced_cram_backend_reports_early_argument_errors_without_reading_cram() { .unwrap_err(); assert!(format!("{err:?}").contains("SNP variant requires ref/reference")); + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + reference: Some("A".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("SNP variant requires alt/alternate")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + kind: Some(VariantKind::Deletion), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("deletion variant requires deletion_length")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + kind: Some(VariantKind::Indel), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("indel variant requires ref/reference")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + reference: Some("A".to_owned()), + kind: Some(VariantKind::Insertion), + 
..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("indel variant requires alt/alternate")); +} + +#[test] +fn forced_cram_backend_reports_file_and_assembly_errors_without_reading_cram() { + let dir = temp_dir("cram-file-assembly-errors"); + let store = forced_cram_store(&dir, "GRCh38.fa"); let err = store .lookup_variant(&VariantSpec { grch38: Some(bioscript_core::GenomicLocus { @@ -617,6 +778,20 @@ fn forced_cram_backend_reports_early_argument_errors_without_reading_cram() { }) .unwrap_err(); assert!(format!("{err:?}").contains("does not yet support Other")); + + let hg19_store = forced_cram_store(&dir, "hg19.fa"); + let err = hg19_store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + kind: Some(VariantKind::Other), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("does not yet support Other")); } #[test] @@ -739,6 +914,33 @@ fn zip_vcf_entry_is_auto_detected_and_readable() { assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); } +#[test] +fn zip_vcf_gz_entry_is_selected_and_read_as_vcf() { + let dir = temp_dir("zip-vcf-gz-entry"); + let zip_path = dir.join("sample.zip"); + + let file = fs::File::create(&zip_path).unwrap(); + let mut writer = zip::ZipWriter::new(file); + writer + .add_directory("nested/", SimpleFileOptions::default()) + .unwrap(); + writer + .start_file("nested/sample.vcf.gz", SimpleFileOptions::default()) + .unwrap(); + writer + .write_all( + b"##fileformat=VCFv4.2\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +2\t22\trsZipVcfGz\tG\tA\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + writer.finish().unwrap(); + + let store = GenotypeStore::from_file(&zip_path).unwrap(); + assert_eq!(store.backend_name(), "vcf"); + assert_eq!(store.get("rsZipVcfGz").unwrap().as_deref(), Some("GA")); +} + #[test] fn shared_real_world_zipped_genotype_exports_are_readable() { 
struct FixtureExpectation { diff --git a/rust/bioscript-formats/tests/inspect.rs b/rust/bioscript-formats/tests/inspect.rs index 717444d..dc81a42 100644 --- a/rust/bioscript-formats/tests/inspect.rs +++ b/rust/bioscript-formats/tests/inspect.rs @@ -144,6 +144,103 @@ fn inspect_bytes_handles_unknown_bytes_conservatively() { ); } +#[test] +fn inspect_bytes_detects_alignment_reference_and_explicit_indexes() { + let dir = temp_dir("bytes-explicit-indexes"); + let index_path = dir.join("sample.cram.crai"); + std::fs::write(&index_path, b"not crai").unwrap(); + let cram = inspect_bytes( + "sample.cram", + b"not actually decoded", + &InspectOptions { + input_index: Some(index_path.clone()), + ..InspectOptions::default() + }, + ) + .unwrap(); + assert_eq!(cram.detected_kind, DetectedKind::AlignmentCram); + assert_eq!(cram.has_index, Some(index_path.exists())); + assert_eq!(cram.index_path.as_deref(), Some(index_path.as_path())); + assert!(cram.evidence.contains(&"extension .cram".to_owned())); + + let bam = inspect_bytes("sample.bam", b"bam bytes", &InspectOptions::default()).unwrap(); + assert_eq!(bam.detected_kind, DetectedKind::AlignmentBam); + assert_eq!( + bam.confidence, + bioscript_formats::DetectionConfidence::Authoritative + ); + + let reference_index = PathBuf::from("ref.fa.fai"); + let reference = inspect_bytes( + "ref.fa", + b">chr1\nACGT\n", + &InspectOptions { + reference_index: Some(reference_index.clone()), + ..InspectOptions::default() + }, + ) + .unwrap(); + assert_eq!(reference.detected_kind, DetectedKind::ReferenceFasta); + assert_eq!( + reference.index_path.as_deref(), + Some(reference_index.as_path()) + ); +} + +#[test] +fn inspect_bytes_zip_skips_macosx_entries_and_can_fallback_to_unknown_file() { + let cursor = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .start_file("__MACOSX/._sample.txt", SimpleFileOptions::default()) + .unwrap(); + writer.write_all(b"ignored").unwrap(); + writer + 
.start_file("nested/readme.bin", SimpleFileOptions::default()) + .unwrap(); + writer.write_all(b"not genotype text").unwrap(); + let bytes = writer.finish().unwrap().into_inner(); + + let inspection = inspect_bytes("archive.zip", &bytes, &InspectOptions::default()).unwrap(); + + assert_eq!(inspection.container, FileContainer::Zip); + assert_eq!(inspection.detected_kind, DetectedKind::Unknown); + assert_eq!( + inspection.selected_entry.as_deref(), + Some("nested/readme.bin") + ); +} + +#[test] +fn inspect_bytes_rejects_empty_or_malformed_zip_archives() { + let err = inspect_bytes("bad.zip", b"not a zip", &InspectOptions::default()).unwrap_err(); + assert!( + format!("{err:?}").contains("failed to read zip bytes"), + "{err:?}" + ); + + let cursor = std::io::Cursor::new(Vec::new()); + let writer = zip::ZipWriter::new(cursor); + let bytes = writer.finish().unwrap().into_inner(); + let err = inspect_bytes("empty.zip", &bytes, &InspectOptions::default()).unwrap_err(); + assert!( + format!("{err:?}").contains("zip archive does not contain a supported file"), + "{err:?}" + ); +} + +#[test] +fn inspection_render_text_includes_empty_source_fields() { + let inspection = + inspect_bytes("sample.fa", b">chr1\nACGT\n", &InspectOptions::default()).unwrap(); + let text = inspection.render_text(); + + assert!(text.contains("kind\treference_fasta"), "{text}"); + assert!(text.contains("vendor\t"), "{text}"); + assert!(text.contains("source_confidence\t"), "{text}"); + assert!(text.contains("reference_matches\t"), "{text}"); +} + #[test] fn ancestrydna_text_fixture_reports_vendor_platform_and_build() { let path = fixtures_dir().join("ancestrydna_v2_sample.txt"); @@ -384,6 +481,64 @@ fn chr_y_cram_fixture_reports_index_without_decoding_entire_file() { assert!(elapsed < 1000); } +#[test] +fn inspect_file_reports_bam_cram_and_reference_indexes_without_decoding() { + let dir = temp_dir("index-detection"); + let cram = dir.join("sample.cram"); + let cram_index = 
dir.join("sample.cram.crai"); + let bam = dir.join("sample.bam"); + let bai = dir.join("sample.bam.bai"); + let fasta = dir.join("reference.fa"); + let fai = dir.join("reference.fa.fai"); + std::fs::write(&cram, b"not cram").unwrap(); + std::fs::write(&cram_index, b"not crai").unwrap(); + std::fs::write(&bam, b"not bam").unwrap(); + std::fs::write(&bai, b"not bai").unwrap(); + std::fs::write(&fasta, b">chr1\nACGT\n").unwrap(); + std::fs::write(&fai, b"chr1\t4\t6\t4\t5\n").unwrap(); + + let cram_inspection = inspect_file(&cram, &InspectOptions::default()).unwrap(); + assert_eq!(cram_inspection.detected_kind, DetectedKind::AlignmentCram); + assert_eq!(cram_inspection.has_index, Some(true)); + assert_eq!( + cram_inspection.index_path.as_deref(), + Some(cram_index.as_path()) + ); + + let bam_inspection = inspect_file(&bam, &InspectOptions::default()).unwrap(); + assert_eq!(bam_inspection.detected_kind, DetectedKind::AlignmentBam); + assert_eq!(bam_inspection.has_index, Some(true)); + assert_eq!(bam_inspection.index_path.as_deref(), Some(bai.as_path())); + + let fasta_inspection = inspect_file(&fasta, &InspectOptions::default()).unwrap(); + assert_eq!(fasta_inspection.detected_kind, DetectedKind::ReferenceFasta); + assert_eq!(fasta_inspection.has_index, Some(true)); + assert_eq!(fasta_inspection.index_path.as_deref(), Some(fai.as_path())); +} + +#[test] +fn inspect_file_reports_missing_short_cram_and_bam_indexes() { + let dir = temp_dir("missing-index-detection"); + let cram = dir.join("sample.cram"); + let bam = dir.join("sample.bam"); + std::fs::write(&cram, b"not cram").unwrap(); + std::fs::write(&bam, b"not bam").unwrap(); + + let cram_inspection = inspect_file(&cram, &InspectOptions::default()).unwrap(); + assert_eq!(cram_inspection.has_index, Some(false)); + assert_eq!( + cram_inspection.index_path.as_deref(), + Some(dir.join("sample.crai").as_path()) + ); + + let bam_inspection = inspect_file(&bam, &InspectOptions::default()).unwrap(); + 
assert_eq!(bam_inspection.has_index, Some(false)); + assert_eq!( + bam_inspection.index_path.as_deref(), + Some(dir.join("sample.bai").as_path()) + ); +} + #[test] fn phased_vcf_reports_phasing() { let dir = temp_dir("phased-vcf"); diff --git a/rust/bioscript-runtime/tests/security.rs b/rust/bioscript-runtime/tests/security.rs index 0cabe59..b1ba2f3 100644 --- a/rust/bioscript-runtime/tests/security.rs +++ b/rust/bioscript-runtime/tests/security.rs @@ -264,6 +264,30 @@ fn runtime_reports_host_method_argument_errors() { "bioscript.query_plan('not a plan')\n", "expected a list of Variant objects or a VariantPlan", ), + ( + "bioscript.query_plan()\n", + "bioscript.query_plan expects self and a list of variants", + ), + ( + "bioscript.query_plan(variants=[])\n", + "bioscript.query_plan does not accept keyword arguments", + ), + ( + "bioscript.variant('rs1')\n", + "bioscript.variant expects only self as a positional argument", + ), + ( + "bioscript.load_genotypes()\n", + "bioscript.load_genotypes expects self and path", + ), + ( + "bioscript.load_genotypes(path='genotypes.txt')\n", + "bioscript.load_genotypes does not accept keyword arguments", + ), + ( + "bioscript.missing_method()\n", + "'Bioscript' object has no attribute 'missing_method'", + ), ( "bioscript.write_tsv('outputs/table.tsv', 'not rows')\n", "write_tsv expects a list of dict rows", @@ -277,3 +301,367 @@ fn runtime_reports_host_method_argument_errors() { assert!(err.contains(expected), "{err}"); } } + +#[test] +fn runtime_reports_genotype_method_argument_errors() { + let dir = temp_dir("genotype-method-errors"); + fs::write( + dir.join("genotypes.txt"), + "rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + + for (call, expected) in [ + ("genotypes.get()", "GenotypeFile.get expects self and rsid"), + ( + "genotypes.get(123)", + "GenotypeFile.get expected str at position 1", + ), + ( + "genotypes.get(rsid='rs1')", + "GenotypeFile.get does not accept keyword arguments", + ), + ( + 
"genotypes.lookup_variant()", + "GenotypeFile.lookup_variant expects self and variant", + ), + ( + "genotypes.lookup_variant_details()", + "GenotypeFile.lookup_variant_details expects self and variant", + ), + ( + "genotypes.lookup_variants()", + "GenotypeFile.lookup_variants expects self and a variant plan", + ), + ( + "genotypes.lookup_variants_details()", + "GenotypeFile.lookup_variants_details expects self and a variant plan", + ), + ( + "genotypes.lookup_variants_details(plan=[])", + "GenotypeFile.lookup_variants_details does not accept keyword arguments", + ), + ( + "genotypes.missing_method()", + "'GenotypeFile' object has no attribute 'missing_method'", + ), + ] { + let code = format!( + r#" +def main(): + genotypes = bioscript.load_genotypes(input_file) + {call} + +if __name__ == "__main__": + main() +"# + ); + let Err(err) = run_script_with_inputs( + &dir, + &code, + vec![( + "input_file", + MontyObject::String("genotypes.txt".to_owned()), + )], + ) else { + panic!("expected genotype method call to fail: {call}"); + }; + assert!(err.contains(expected), "{err}"); + } +} + +#[test] +fn runtime_batch_lookup_methods_return_values_details_and_timings() { + let dir = temp_dir("batch-lookup-methods"); + fs::write( + dir.join("genotypes.txt"), + "rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\nrs2\t1\t20\tCT\n", + ) + .unwrap(); + + let runtime = run_script_with_inputs( + &dir, + r#" +RS1 = bioscript.variant(rsid="rs1") +RS2 = bioscript.variant(rsid="rs2") +MISSING = bioscript.variant(rsid="rsMissing") + +def main(): + genotypes = bioscript.load_genotypes(input_file) + plan = bioscript.query_plan([RS2, MISSING, RS1]) + values = genotypes.lookup_variants(plan) + details = genotypes.lookup_variants_details(plan) + bioscript.write_text("outputs/batch.txt", str(values) + "\n" + str(details)) + +if __name__ == "__main__": + main() +"#, + vec![( + "input_file", + MontyObject::String("genotypes.txt".to_owned()), + )], + ) + .unwrap(); + + let output = 
fs::read_to_string(dir.join("outputs/batch.txt")).unwrap(); + assert!(output.contains("CT"), "{output}"); + assert!(output.contains("AG"), "{output}"); + assert!(output.contains("VariantObservation"), "{output}"); + assert!( + output.contains("no matching rsid or locus found"), + "{output}" + ); + let timings = runtime.timing_snapshot(); + assert!( + timings + .iter() + .any(|timing| timing.stage == "lookup_variants") + ); + assert!( + timings + .iter() + .any(|timing| timing.stage == "lookup_variants_details") + ); +} + +#[test] +fn runtime_single_lookup_methods_return_values_and_none() { + let dir = temp_dir("single-lookup-methods"); + fs::write( + dir.join("genotypes.txt"), + "rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + + let runtime = run_script_with_inputs( + &dir, + r#" +RS1 = bioscript.variant(rsid="rs1") +MISSING = bioscript.variant(rsid="rsMissing") + +def main(): + genotypes = bioscript.load_genotypes(input_file) + values = [ + genotypes.get("rs1"), + genotypes.get("rsMissing"), + genotypes.lookup_variant(RS1), + genotypes.lookup_variant(MISSING), + ] + bioscript.write_text("outputs/single.txt", str(values)) + +if __name__ == "__main__": + main() +"#, + vec![( + "input_file", + MontyObject::String("genotypes.txt".to_owned()), + )], + ) + .unwrap(); + + let output = fs::read_to_string(dir.join("outputs/single.txt")).unwrap(); + assert!(output.contains("AG"), "{output}"); + assert!(output.contains("None"), "{output}"); + assert!( + runtime + .timing_snapshot() + .iter() + .any(|timing| timing.stage == "lookup_variant") + ); +} + +#[test] +fn runtime_variant_objects_preserve_optional_fields() { + let dir = temp_dir("variant-optional-fields"); + run_script_with_inputs( + &dir, + r#" +def main(): + insertion = bioscript.variant( + rsids=["rs1", "rs2"], + grch37="chr1:10-11", + grch38="2:20", + ref="A", + alt="AT", + kind="insertion", + motifs=["AT", "TA"], + ) + deletion = bioscript.variant( + grch38="3:30-32", + 
kind="deletion", + deletion_length=3, + ) + indel = bioscript.variant(kind="indel") + other = bioscript.variant(kind="other") + plan = bioscript.query_plan([insertion, deletion, indel, other]) + bioscript.write_text("outputs/variants.txt", str(plan)) + +if __name__ == "__main__": + main() +"#, + Vec::new(), + ) + .unwrap(); + + let output = fs::read_to_string(dir.join("outputs/variants.txt")).unwrap(); + assert!(output.contains("grch37='1:10-11'"), "{output}"); + assert!(output.contains("grch38='2:20-20'"), "{output}"); + assert!(output.contains("reference='A'"), "{output}"); + assert!(output.contains("alternate='AT'"), "{output}"); + assert!(output.contains("kind='insertion'"), "{output}"); + assert!(output.contains("deletion_length=3"), "{output}"); + assert!(output.contains("motifs=['AT', 'TA']"), "{output}"); + assert!(output.contains("kind='indel'"), "{output}"); + assert!(output.contains("kind='other'"), "{output}"); +} + +#[test] +fn runtime_direct_run_script_and_accessors_are_usable() { + let dir = temp_dir("direct-run-script"); + let runtime = BioscriptRuntime::with_config( + &dir, + RuntimeConfig { + loader: GenotypeLoadOptions::default(), + ..RuntimeConfig::default() + }, + ) + .unwrap(); + + assert_eq!(runtime.root(), dir.canonicalize().unwrap().as_path()); + assert!(runtime.config().loader.input_index.is_none()); + let result = runtime + .run_script("result = 2 + 3\nresult\n", "inline.py", Vec::new()) + .unwrap(); + assert!(matches!(result, MontyObject::Int(5))); +} + +#[test] +fn runtime_reports_filesystem_setup_errors() { + let dir = temp_dir("filesystem-errors"); + let missing_root = dir.join("missing-root"); + let Err(err) = BioscriptRuntime::new(&missing_root) else { + panic!("expected missing root to fail"); + }; + assert!( + err.to_string() + .contains("failed to canonicalize bioscript root"), + "{err}" + ); + + let runtime = BioscriptRuntime::new(&dir).unwrap(); + let err = runtime + .run_file(dir.join("missing-script.py"), None, Vec::new()) + 
.unwrap_err(); + assert!(err.to_string().contains("failed to read script"), "{err}"); +} + +#[test] +fn runtime_loader_paths_are_resolved_and_escape_checks_apply() { + let dir = temp_dir("loader-paths"); + fs::write( + dir.join("genotypes.txt"), + "rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + + let runtime = BioscriptRuntime::with_config( + &dir, + RuntimeConfig { + loader: GenotypeLoadOptions { + format: Some(bioscript_formats::GenotypeSourceFormat::Cram), + input_index: Some(PathBuf::from("indexes/input.crai")), + reference_file: Some(PathBuf::from("refs/ref.fa")), + reference_index: Some(PathBuf::from("refs/ref.fa.fai")), + }, + ..RuntimeConfig::default() + }, + ) + .unwrap(); + let err = runtime + .run_file( + { + let script = dir.join("script.py"); + fs::write( + &script, + r#" +SNP = bioscript.variant(grch38="1:10-10", ref="A", alt="G", kind="snp") + +def main(): + genotypes = bioscript.load_genotypes("genotypes.txt") + genotypes.lookup_variant(SNP) + +if __name__ == "__main__": + main() +"#, + ) + .unwrap(); + script + }, + None, + Vec::new(), + ) + .unwrap_err(); + assert!( + err.to_string().contains("failed to open indexed FASTA"), + "{err}" + ); + + let err = run_script("bioscript.read_text('/tmp/outside.txt')\n").unwrap_err(); + assert!(err.contains("absolute paths are not allowed"), "{err}"); + let err = run_script("bioscript.write_text('../outside.txt', 'x')\n").unwrap_err(); + assert!(err.contains("path escapes bioscript root"), "{err}"); +} + +#[test] +fn runtime_write_tsv_reports_row_shape_errors() { + for (code, expected) in [ + ( + "bioscript.write_tsv('outputs/table.tsv', ['not a dict'])\n", + "write_tsv row must be a dict", + ), + ( + "bioscript.write_tsv('outputs/table.tsv', [{1: 'bad key'}])\n", + "write_tsv dict keys must be strings", + ), + ( + "bioscript.write_tsv('outputs/table.tsv')\n", + "bioscript.write_tsv expects self, path, rows", + ), + ] { + let err = run_script(code).unwrap_err(); + 
assert!(err.contains(expected), "{err}"); + } +} + +#[test] +fn runtime_variant_argument_conversions_cover_optional_and_list_errors() { + for (code, expected) in [ + ( + "bioscript.variant(rsids=['rs1', 2])\n", + "expected list of strings", + ), + ( + "bioscript.variant(grch38=123)\n", + "expected optional string", + ), + ( + "bioscript.variant(deletion_length='long')\n", + "expected optional int", + ), + ( + "bioscript.variant(motifs=123)\n", + "expected string or list of strings", + ), + ( + "bioscript.variant(grch38='1:not-a-start-20')\n", + "invalid locus start", + ), + ( + "bioscript.variant(grch38='1:10-not-an-end')\n", + "invalid locus end", + ), + ] { + let err = run_script(code).unwrap_err(); + assert!(err.contains(expected), "{err}"); + } +} From 8c050fb37e2f20dc69cd017f744ffb38d63ab81e Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 15:30:41 +1000 Subject: [PATCH 6/9] more fixes and coverage --- .github/workflows/coverage.yml | 2 +- coverage.sh | 8 +- docs/architecture.md | 2 +- rust/bioscript-cli/src/main.rs | 4 +- rust/bioscript-ffi/src/lib.rs | 68 +- rust/bioscript-formats/src/alignment.rs | 242 ++++++- rust/bioscript-formats/src/genotype.rs | 679 ++++++++++++++++-- rust/bioscript-formats/src/inspect.rs | 148 +++- rust/bioscript-formats/src/lib.rs | 1 + rust/bioscript-formats/tests/file_formats.rs | 17 +- rust/bioscript-runtime/src/runtime.rs | 215 +++++- rust/bioscript-runtime/tests/security.rs | 141 +++- .../tests/validate_variants.rs | 218 ++++++ rust/bioscript-wasm/src/lib.rs | 21 + 14 files changed, 1650 insertions(+), 116 deletions(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 4ed7f47..61a5cee 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - test: [file_formats, inspect, prepare, cli, schema, core, runtime_security, runtime_resources] + test: [file_formats, formats_lib, inspect, prepare, cli, 
schema, core, runtime_security, runtime_resources] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/coverage.sh b/coverage.sh index 9cb9d94..42046dc 100755 --- a/coverage.sh +++ b/coverage.sh @@ -24,8 +24,8 @@ Usage: ./coverage.sh [--full-clean|-c] [--open] [--large] [--all-tests] [--no-li --all-tests Run all tests for the first-party BioScript crates --no-lint Skip cargo fmt and clippy checks --focused-test Run one focused integration test target: - file_formats, inspect, prepare, cli, schema, core, runtime_security, - or runtime_resources + file_formats, formats_lib, inspect, prepare, cli, schema, core, + runtime_security, or runtime_resources Environment: AUTO_INSTALL_LLVM_COV=0 Do not auto-install cargo-llvm-cov @@ -160,6 +160,9 @@ if [[ -n "$FOCUSED_TEST" ]]; then file_formats) env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test file_formats -- --nocapture --test-threads="$TEST_THREADS" ;; + formats_lib) + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --lib + ;; inspect) env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test inspect -- --nocapture --test-threads="$TEST_THREADS" ;; @@ -191,6 +194,7 @@ elif [[ "$ALL_TESTS_FLAG" == "1" ]]; then env "${COV_ENV[@]}" cargo llvm-cov --no-report "${PKG_ARGS[@]}" --all-targets else env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test file_formats -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --lib env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test inspect -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test prepare -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --test cli -- --nocapture --test-threads="$TEST_THREADS" diff --git a/docs/architecture.md b/docs/architecture.md index d822646..3b39f7c 
100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -58,7 +58,7 @@ Given a single SNP or indel locus, the read path is: 5. For each selected slice: - `decode_blocks()` — decompresses the CRAM blocks once per slice. - `records_while(..., validate_reference_md5 = true, on_record)` — streams records one at a time. For each record we construct an `AlignmentRecord` (start/end/sequence/CIGAR) and either skip it (outside the interval), forward it to the caller, or **stop** (once `record.start > locus.end`, since slices are coordinate-sorted). -6. On `reference sequence checksum mismatch`, the call is retried with `validate_reference_md5 = false`, a loud warning is written to `stderr`, and decoding proceeds. Results may be wrong at positions where the supplied FASTA actually differs from the encoding reference — the warning tells the user to investigate. +6. On `reference sequence checksum mismatch`, decoding fails closed by default. Callers may explicitly pass `--allow-md5-mismatch` to retry with `validate_reference_md5 = false`; in that mode a loud warning is written to `stderr`, and results may be wrong at positions where the supplied FASTA actually differs from the encoding reference. Compared to calling upstream `Slice::records()` directly, the streaming path turns decoding ~10 000 records into decoding ~40 — roughly three orders of magnitude less work per locus. 
diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index 37a34b2..844f7ef 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -129,6 +129,8 @@ fn run_cli() -> Result<(), String> { return Err("--reference-index requires a path".to_owned()); }; loader.reference_index = Some(PathBuf::from(value)); + } else if arg == "--allow-md5-mismatch" { + loader.allow_reference_md5_mismatch = true; } else if arg == "--max-duration-ms" { let Some(value) = args.next() else { return Err("--max-duration-ms requires an integer".to_owned()); @@ -177,7 +179,7 @@ fn run_cli() -> Result<(), String> { let Some(script_path) = script_path else { return Err( - "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--allow-md5-mismatch] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] 
[--reference-index ]" .to_owned(), ); }; diff --git a/rust/bioscript-ffi/src/lib.rs b/rust/bioscript-ffi/src/lib.rs index 2909d16..75e0c0b 100644 --- a/rust/bioscript-ffi/src/lib.rs +++ b/rust/bioscript-ffi/src/lib.rs @@ -15,6 +15,15 @@ use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; use monty::{MontyObject, ResourceLimits}; use serde::{Deserialize, Serialize}; +const DEFAULT_MAX_DURATION_MS: u64 = 100; +const DEFAULT_MAX_MEMORY_BYTES: usize = 8 * 1024 * 1024; +const DEFAULT_MAX_ALLOCATIONS: usize = 200_000; +const DEFAULT_MAX_RECURSION_DEPTH: usize = 200; +const HARD_MAX_DURATION_MS: u64 = 60_000; +const HARD_MAX_MEMORY_BYTES: usize = 256 * 1024 * 1024; +const HARD_MAX_ALLOCATIONS: usize = 10_000_000; +const HARD_MAX_RECURSION_DEPTH: usize = 10_000; + #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct RunFileRequest { @@ -29,6 +38,7 @@ pub struct RunFileRequest { pub input_index: Option, pub reference_file: Option, pub reference_index: Option, + pub allow_md5_mismatch: Option, pub auto_index: Option, pub cache_dir: Option, pub max_duration_ms: Option, @@ -168,28 +178,29 @@ fn build_loader(request: &RunFileRequest) -> Result loader.input_index = request.input_index.clone().map(PathBuf::from); loader.reference_file = request.reference_file.clone().map(PathBuf::from); loader.reference_index = request.reference_index.clone().map(PathBuf::from); + loader.allow_reference_md5_mismatch = request.allow_md5_mismatch.unwrap_or(false); Ok(loader) } fn build_limits(request: &RunFileRequest) -> ResourceLimits { let mut limits = ResourceLimits::new() - .max_duration(Duration::from_millis(100)) - .max_memory(8 * 1024 * 1024) - .max_allocations(200_000) + .max_duration(Duration::from_millis(DEFAULT_MAX_DURATION_MS)) + .max_memory(DEFAULT_MAX_MEMORY_BYTES) + .max_allocations(DEFAULT_MAX_ALLOCATIONS) .gc_interval(1000) - .max_recursion_depth(Some(200)); + .max_recursion_depth(Some(DEFAULT_MAX_RECURSION_DEPTH)); if let Some(value) = 
request.max_duration_ms { - limits = limits.max_duration(Duration::from_millis(value)); + limits = limits.max_duration(Duration::from_millis(value.min(HARD_MAX_DURATION_MS))); } if let Some(value) = request.max_memory_bytes { - limits = limits.max_memory(value); + limits = limits.max_memory(value.min(HARD_MAX_MEMORY_BYTES)); } if let Some(value) = request.max_allocations { - limits = limits.max_allocations(value); + limits = limits.max_allocations(value.min(HARD_MAX_ALLOCATIONS)); } if let Some(value) = request.max_recursion_depth { - limits = limits.max_recursion_depth(Some(value)); + limits = limits.max_recursion_depth(Some(value.min(HARD_MAX_RECURSION_DEPTH))); } limits @@ -331,3 +342,44 @@ pub mod android { .expect("jni new_string should succeed") } } + +#[cfg(test)] +mod tests { + use super::*; + + fn request_with_limits() -> RunFileRequest { + RunFileRequest { + script_path: "script.py".to_owned(), + root: None, + input_file: None, + output_file: None, + participant_id: None, + trace_report_path: None, + timing_report_path: None, + input_format: None, + input_index: None, + reference_file: None, + reference_index: None, + allow_md5_mismatch: None, + auto_index: None, + cache_dir: None, + max_duration_ms: Some(u64::MAX), + max_memory_bytes: Some(usize::MAX), + max_allocations: Some(usize::MAX), + max_recursion_depth: Some(usize::MAX), + } + } + + #[test] + fn ffi_resource_limits_are_clamped_to_hard_ceilings() { + let limits = build_limits(&request_with_limits()); + + assert_eq!( + limits.max_duration, + Some(Duration::from_millis(HARD_MAX_DURATION_MS)) + ); + assert_eq!(limits.max_memory, Some(HARD_MAX_MEMORY_BYTES)); + assert_eq!(limits.max_allocations, Some(HARD_MAX_ALLOCATIONS)); + assert_eq!(limits.max_recursion_depth, Some(HARD_MAX_RECURSION_DEPTH)); + } +} diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index 59f2568..2b4901f 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ 
b/rust/bioscript-formats/src/alignment.rs @@ -65,7 +65,13 @@ where let repository = build_reference_repository(reference_file)?; let mut reader = build_cram_indexed_reader_from_path(path, options, repository)?; let label = path.display().to_string(); - for_each_cram_record_with_reader(&mut reader, &label, locus, on_record) + for_each_cram_record_with_reader_inner( + &mut reader, + &label, + locus, + options.allow_reference_md5_mismatch, + on_record, + ) } pub(crate) fn query_cram_records( @@ -89,6 +95,20 @@ pub fn for_each_cram_record_with_reader( reader: &mut cram::io::indexed_reader::IndexedReader, label: &str, locus: &GenomicLocus, + on_record: F, +) -> Result<(), RuntimeError> +where + R: Read + Seek, + F: FnMut(AlignmentRecord) -> Result, +{ + for_each_cram_record_with_reader_inner(reader, label, locus, false, on_record) +} + +fn for_each_cram_record_with_reader_inner( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + allow_reference_md5_mismatch: bool, mut on_record: F, ) -> Result<(), RuntimeError> where @@ -122,6 +142,7 @@ where &region, locus.end, &selected_containers, + allow_reference_md5_mismatch, &mut on_record, ) } @@ -134,6 +155,20 @@ pub fn for_each_raw_cram_record_with_reader( reader: &mut cram::io::indexed_reader::IndexedReader, label: &str, locus: &GenomicLocus, + on_record: F, +) -> Result<(), RuntimeError> +where + R: Read + Seek, + F: FnMut(cram::Record<'_>) -> Result, +{ + for_each_raw_cram_record_with_reader_inner(reader, label, locus, false, on_record) +} + +pub(crate) fn for_each_raw_cram_record_with_reader_inner( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + allow_reference_md5_mismatch: bool, mut on_record: F, ) -> Result<(), RuntimeError> where @@ -169,6 +204,7 @@ where &region, locus.end, &selected_containers, + allow_reference_md5_mismatch, &mut on_record, ) } @@ -326,6 +362,7 @@ fn stream_selected_alignment_records( region: &Region, locus_end: 
i64, selected_containers: &[SelectedContainer], + allow_reference_md5_mismatch: bool, on_record: &mut F, ) -> Result<(), RuntimeError> where @@ -339,6 +376,7 @@ where region, locus_end, selected_containers, + allow_reference_md5_mismatch, &mut |record| { let alignment_record = build_alignment_record_from_cram(label, &record)?; on_record(alignment_record) @@ -353,6 +391,7 @@ fn stream_selected_cram_records( region: &Region, locus_end: i64, selected_containers: &[SelectedContainer], + allow_reference_md5_mismatch: bool, on_record: &mut F, ) -> Result<(), RuntimeError> where @@ -459,7 +498,7 @@ where match decode_result { Ok(()) => {} - Err(err) if is_reference_md5_mismatch(&err) => { + Err(err) if allow_reference_md5_mismatch && is_reference_md5_mismatch(&err) => { eprintln!( "[bioscript] warning: CRAM reference MD5 mismatch for {label} slice landmark {landmark} — \ retrying without checksum validation. Results may be incorrect if the \ @@ -517,6 +556,11 @@ where )) })?; } + Err(err) if is_reference_md5_mismatch(&err) => { + return Err(RuntimeError::Io(format!( + "CRAM reference MD5 mismatch for {label} slice landmark {landmark}; rerun with --allow-md5-mismatch only if this lenient decode is intentional. 
Details: {err}" + ))); + } Err(err) => { return Err(RuntimeError::Io(format!( "failed to decode CRAM slice records from {label}: {err}" @@ -679,3 +723,197 @@ fn map_op(op: sam::alignment::record::cigar::Op) -> AlignmentOp { len: op.len(), } } + +#[cfg(test)] +mod tests { + use super::*; + use std::num::NonZero; + + use noodles::sam::{ + self, + alignment::record::cigar::{Op, op::Kind}, + header::record::value::{Map, map::ReferenceSequence}, + }; + + fn locus(chrom: &str, start: i64, end: i64) -> GenomicLocus { + GenomicLocus { + chrom: chrom.to_owned(), + start, + end, + } + } + + fn header() -> sam::Header { + sam::Header::builder() + .add_reference_sequence( + "chr1", + Map::::new(NonZero::new(100).unwrap()), + ) + .add_reference_sequence( + "2", + Map::::new(NonZero::new(200).unwrap()), + ) + .build() + } + + #[test] + fn alignment_helpers_cover_header_region_and_interval_logic() { + let header = header(); + assert_eq!( + resolve_reference_name(&header, "1").as_deref(), + Some("chr1") + ); + assert_eq!( + resolve_reference_name(&header, "chr2").as_deref(), + Some("2") + ); + assert_eq!(resolve_reference_name(&header, "3"), None); + assert_eq!(resolve_reference_sequence_id(&header, b"chr1"), Some(0)); + assert_eq!(resolve_reference_sequence_id(&header, b"2"), Some(1)); + assert_eq!(resolve_reference_sequence_id(&header, b"missing"), None); + + let region = build_region(&header, &locus("1", 10, 20)).unwrap(); + assert_eq!(region.name(), b"chr1"); + assert!(build_region(&header, &locus("missing", 10, 20)).is_none()); + assert!(build_region(&header, &locus("1", -1, 20)).is_none()); + + let interval = region.interval(); + let hit = crai::Record::new(Some(0), Position::new(12), 3, 100, 1, 20); + let miss_ref = crai::Record::new(Some(1), Position::new(12), 3, 100, 1, 20); + let no_start = crai::Record::new(Some(0), None, 3, 100, 1, 20); + let zero_span = crai::Record::new(Some(0), Position::new(12), 0, 100, 1, 20); + assert!(record_intersects_interval(&hit, interval)); 
+ assert!(record_intersects_interval(&miss_ref, interval)); + assert!(!record_intersects_interval(&no_start, interval)); + assert!(!record_intersects_interval(&zero_span, interval)); + + let alignment_hit = AlignmentRecord { + start: 11, + end: 13, + is_unmapped: false, + cigar: Vec::new(), + }; + let alignment_miss = AlignmentRecord { + start: 30, + end: 40, + is_unmapped: false, + cigar: Vec::new(), + }; + let bad_start = AlignmentRecord { + start: -1, + end: 10, + is_unmapped: false, + cigar: Vec::new(), + }; + assert!(alignment_record_intersects_interval( + &alignment_hit, + interval + )); + assert!(!alignment_record_intersects_interval( + &alignment_miss, + interval + )); + assert!(!alignment_record_intersects_interval(&bad_start, interval)); + } + + #[test] + fn alignment_helpers_cover_index_selection_and_operation_mapping() { + let header = header(); + let region = build_region(&header, &locus("1", 10, 20)).unwrap(); + let index = vec![ + crai::Record::new(Some(0), Position::new(8), 5, 100, 1, 20), + crai::Record::new(Some(0), Position::new(19), 3, 100, 2, 20), + crai::Record::new(Some(1), Position::new(12), 3, 200, 3, 20), + crai::Record::new(Some(0), Position::new(30), 3, 300, 4, 20), + ]; + let selected = select_query_containers(&index, &header, &region).unwrap(); + assert_eq!(selected.len(), 1); + assert_eq!(selected[0].offset, 100); + assert!(selected[0].landmarks.contains(&1)); + assert!(selected[0].landmarks.contains(&2)); + + let missing_region: Region = "missing:1-2".parse().unwrap(); + let err = select_query_containers(&index, &header, &missing_region).unwrap_err(); + assert!(err.to_string().contains("does not contain contig")); + + let cases = [ + (Kind::Match, AlignmentOpKind::Match), + (Kind::Insertion, AlignmentOpKind::Insertion), + (Kind::Deletion, AlignmentOpKind::Deletion), + (Kind::Skip, AlignmentOpKind::Skip), + (Kind::SoftClip, AlignmentOpKind::SoftClip), + (Kind::HardClip, AlignmentOpKind::HardClip), + (Kind::Pad, AlignmentOpKind::Pad), + 
(Kind::SequenceMatch, AlignmentOpKind::SequenceMatch), + (Kind::SequenceMismatch, AlignmentOpKind::SequenceMismatch), + ]; + for (kind, expected) in cases { + assert_eq!( + map_op(Op::new(kind, 7)), + AlignmentOp { + kind: expected, + len: 7 + } + ); + } + } + + #[test] + fn alignment_helpers_cover_parser_and_builder_errors() { + assert!( + parse_crai_bytes(b"not a crai") + .unwrap_err() + .to_string() + .contains("CRAM index") + ); + assert!( + parse_fai_bytes(b"not a fai") + .unwrap_err() + .to_string() + .contains("FASTA index") + ); + assert!( + parse_tbi_bytes(b"not a tbi") + .unwrap_err() + .to_string() + .contains("tabix index") + ); + assert!( + build_reference_repository(Path::new("/definitely/missing/reference.fa")) + .unwrap_err() + .to_string() + .contains("failed to open indexed FASTA") + ); + + let repository = build_reference_repository_from_readers( + std::io::Cursor::new(Vec::::new()), + fasta::fai::Index::default(), + ); + let options = GenotypeLoadOptions { + input_index: Some(Path::new("/definitely/missing/input.crai").to_path_buf()), + ..GenotypeLoadOptions::default() + }; + let Err(err) = + build_cram_indexed_reader_from_path(Path::new("sample.cram"), &options, repository) + else { + panic!("expected missing CRAM index to fail"); + }; + assert!(err.to_string().contains("failed to read CRAM index")); + + let repository = build_reference_repository_from_readers( + std::io::Cursor::new(Vec::::new()), + fasta::fai::Index::default(), + ); + let reader = build_cram_indexed_reader_from_reader( + std::io::Cursor::new(Vec::::new()), + crai::Index::default(), + repository, + ); + assert!(reader.is_ok()); + + let err = std::io::Error::other("reference sequence checksum mismatch: expected"); + assert!(is_reference_md5_mismatch(&err)); + let err = std::io::Error::other("other decode error"); + assert!(!is_reference_md5_mismatch(&err)); + } +} diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 59d6acd..ae2091a 
100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -24,6 +24,7 @@ use crate::alignment::{self, AlignmentOpKind, AlignmentRecord}; const COMMENT_PREFIXES: [&str; 2] = ["#", "//"]; const DEFAULT_MPILEUP_MIN_BASE_QUALITY: u8 = 13; const DEFAULT_MPILEUP_MIN_MAPPING_QUALITY: u8 = 0; +const MAX_ZIP_ENTRY_BYTES: u64 = 128 * 1024 * 1024; const RSID_ALIASES: &[&str] = &["rsid", "name", "snp", "marker", "id", "snpid"]; const CHROM_ALIASES: &[&str] = &["chromosome", "chr", "chrom"]; @@ -127,6 +128,7 @@ pub struct GenotypeLoadOptions { pub input_index: Option, pub reference_file: Option, pub reference_index: Option, + pub allow_reference_md5_mismatch: bool, } impl GenotypeStore { @@ -197,12 +199,11 @@ impl GenotypeStore { "failed to open genotype entry {selected} in {name}: {err}" )) })?; - let mut contents = Vec::new(); - entry.read_to_end(&mut contents).map_err(|err| { - RuntimeError::Io(format!( - "failed to read genotype entry {selected} in {name}: {err}" - )) - })?; + let contents = read_zip_entry_limited( + &mut entry, + MAX_ZIP_ENTRY_BYTES, + &format!("genotype entry {selected} in {name}"), + )?; let lines = read_lines_from_reader(BufReader::new(Cursor::new(contents)), Path::new(&selected))?; if selected.to_ascii_lowercase().ends_with(".vcf") { @@ -973,7 +974,14 @@ fn observe_snp_pileup( let mut reader = alignment::build_cram_indexed_reader_from_path(cram_path, options, repository)?; let label = cram_path.display().to_string(); - snp_pileup_with_reader(&mut reader, &label, locus, reference, alternate) + snp_pileup_with_reader( + &mut reader, + &label, + locus, + reference, + alternate, + options.allow_reference_md5_mismatch, + ) } fn snp_pileup_with_reader( @@ -982,6 +990,7 @@ fn snp_pileup_with_reader( locus: &GenomicLocus, reference: char, alternate: char, + allow_reference_md5_mismatch: bool, ) -> Result { let mut counts = SnpPileupCounts::default(); let target_position = 
Position::try_from(usize::try_from(locus.start).map_err(|_| { @@ -990,80 +999,86 @@ fn snp_pileup_with_reader( .map_err(|_| RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()))?; let reference_base = reference as u8; - alignment::for_each_raw_cram_record_with_reader(reader, label, locus, |record| { - let flags = record - .flags() - .map_err(|err| RuntimeError::Io(format!("failed to read CRAM flags: {err}")))?; - if flags.is_unmapped() { - counts.filtered_unmapped += 1; - return Ok(true); - } - if flags.is_secondary() { - counts.filtered_secondary += 1; - return Ok(true); - } - if flags.is_qc_fail() { - counts.filtered_qc_fail += 1; - return Ok(true); - } - if flags.is_duplicate() { - counts.filtered_duplicate += 1; - return Ok(true); - } - if flags.is_segmented() && !flags.is_properly_segmented() { - counts.filtered_improper_pair += 1; - return Ok(true); - } - - let Some((base, base_quality)) = - record.base_quality_at_reference_position(target_position, reference_base) - else { - return Ok(true); - }; + alignment::for_each_raw_cram_record_with_reader_inner( + reader, + label, + locus, + allow_reference_md5_mismatch, + |record| { + let flags = record + .flags() + .map_err(|err| RuntimeError::Io(format!("failed to read CRAM flags: {err}")))?; + if flags.is_unmapped() { + counts.filtered_unmapped += 1; + return Ok(true); + } + if flags.is_secondary() { + counts.filtered_secondary += 1; + return Ok(true); + } + if flags.is_qc_fail() { + counts.filtered_qc_fail += 1; + return Ok(true); + } + if flags.is_duplicate() { + counts.filtered_duplicate += 1; + return Ok(true); + } + if flags.is_segmented() && !flags.is_properly_segmented() { + counts.filtered_improper_pair += 1; + return Ok(true); + } - let normalized_base = normalize_pileup_base(base); - record.mapping_quality().transpose().map_err(|err| { - RuntimeError::Io(format!("failed to read CRAM mapping quality: {err}")) - })?; - let is_reverse = flags.is_reverse_complemented(); - if let 
Some(base) = normalized_base { - counts.raw_depth += 1; - *counts.raw_base_counts.entry(base.to_string()).or_insert(0) += 1; - let strand_counts = if is_reverse { - &mut counts.raw_reverse_counts - } else { - &mut counts.raw_forward_counts + let Some((base, base_quality)) = + record.base_quality_at_reference_position(target_position, reference_base) + else { + return Ok(true); }; - *strand_counts.entry(base.to_string()).or_insert(0) += 1; - if base == reference { - counts.raw_ref_count += 1; - } else if base == alternate { - counts.raw_alt_count += 1; + + let normalized_base = normalize_pileup_base(base); + record.mapping_quality().transpose().map_err(|err| { + RuntimeError::Io(format!("failed to read CRAM mapping quality: {err}")) + })?; + let is_reverse = flags.is_reverse_complemented(); + if let Some(base) = normalized_base { + counts.raw_depth += 1; + *counts.raw_base_counts.entry(base.to_string()).or_insert(0) += 1; + let strand_counts = if is_reverse { + &mut counts.raw_reverse_counts + } else { + &mut counts.raw_forward_counts + }; + *strand_counts.entry(base.to_string()).or_insert(0) += 1; + if base == reference { + counts.raw_ref_count += 1; + } else if base == alternate { + counts.raw_alt_count += 1; + } } - } - if base_quality < DEFAULT_MPILEUP_MIN_BASE_QUALITY { - counts.filtered_low_base_quality += 1; - return Ok(true); - } + if base_quality < DEFAULT_MPILEUP_MIN_BASE_QUALITY { + counts.filtered_low_base_quality += 1; + return Ok(true); + } - let Some(base) = normalized_base else { - counts.filtered_non_acgt += 1; - return Ok(true); - }; + let Some(base) = normalized_base else { + counts.filtered_non_acgt += 1; + return Ok(true); + }; - counts.filtered_depth += 1; - *counts - .filtered_base_counts - .entry(base.to_string()) - .or_insert(0) += 1; - if base == reference { - counts.filtered_ref_count += 1; - } else if base == alternate { - counts.filtered_alt_count += 1; - } - Ok(true) - })?; + counts.filtered_depth += 1; + *counts + .filtered_base_counts 
+ .entry(base.to_string()) + .or_insert(0) += 1; + if base == reference { + counts.filtered_ref_count += 1; + } else if base == alternate { + counts.filtered_alt_count += 1; + } + Ok(true) + }, + )?; Ok(counts) } @@ -1085,7 +1100,7 @@ pub fn observe_cram_snp_with_reader( matched_rsid: Option, assembly: Option, ) -> Result { - let pileup = snp_pileup_with_reader(reader, label, locus, reference, alternate)?; + let pileup = snp_pileup_with_reader(reader, label, locus, reference, alternate, false)?; let ref_count = pileup.filtered_ref_count; let alt_count = pileup.filtered_alt_count; let depth = pileup.filtered_depth; @@ -2471,6 +2486,24 @@ fn read_lines_from_reader( Ok(lines) } +fn read_zip_entry_limited( + reader: &mut R, + max_bytes: u64, + label: &str, +) -> Result, RuntimeError> { + let mut contents = Vec::new(); + reader + .take(max_bytes.saturating_add(1)) + .read_to_end(&mut contents) + .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; + if u64::try_from(contents.len()).unwrap_or(u64::MAX) > max_bytes { + return Err(RuntimeError::InvalidArguments(format!( + "{label} exceeds decompressed limit of {max_bytes} bytes" + ))); + } + Ok(contents) +} + fn detect_source_format( path: &Path, forced: Option, @@ -2572,3 +2605,497 @@ fn is_symbolic_vcf_alt(alternate: &str) -> bool { fn normalize_sequence_token(value: &str) -> String { value.trim().to_ascii_uppercase() } + +#[cfg(test)] +mod tests { + use super::*; + use std::{ + fs, + io::Write, + time::{SystemTime, UNIX_EPOCH}, + }; + + use zip::write::SimpleFileOptions; + + use crate::alignment::AlignmentOp; + + fn temp_dir(label: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock drift") + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "bioscript-genotype-{label}-{}-{nanos}", + std::process::id() + )); + fs::create_dir_all(&dir).unwrap(); + dir + } + + fn locus(chrom: &str, start: i64, end: i64) -> GenomicLocus { + GenomicLocus { + 
chrom: chrom.to_owned(), + start, + end, + } + } + + fn variant_with_loci() -> VariantSpec { + VariantSpec { + rsids: vec!["rs1".to_owned()], + grch37: Some(locus("1", 10, 10)), + grch38: Some(locus("2", 20, 20)), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + deletion_length: None, + motifs: Vec::new(), + } + } + + #[test] + fn genotype_private_helpers_cover_assembly_sorting_and_decision_rules() { + let variant = variant_with_loci(); + + assert_eq!( + choose_variant_locus(&variant, Path::new("ref/hg38.fa")), + Some((Assembly::Grch38, locus("2", 20, 20))) + ); + assert_eq!( + choose_variant_locus(&variant, Path::new("ref/hg19.fa")), + Some((Assembly::Grch37, locus("1", 10, 10))) + ); + assert_eq!( + choose_variant_locus(&variant, Path::new("ref/unknown.fa")), + Some((Assembly::Grch38, locus("2", 20, 20))) + ); + assert_eq!( + choose_variant_locus_for_assembly(&variant, Some(Assembly::Grch37)), + Some(locus("1", 10, 10)) + ); + assert_eq!( + detect_reference_assembly(Path::new("assembly37.fa")), + Some(Assembly::Grch37) + ); + assert_eq!( + detect_reference_assembly(Path::new("assembly38.fa")), + Some(Assembly::Grch38) + ); + assert_eq!(detect_reference_assembly(Path::new("other.fa")), None); + + assert_eq!(describe_locus(&locus("chr1", 7, 9)), "chr1:7-9"); + assert_eq!(anchor_window(&locus("1", 1, 4)), locus("1", 0, 0)); + assert_eq!(first_base(" tg"), Some('T')); + assert_eq!(first_base(""), None); + + assert_eq!(infer_snp_genotype('A', 'G', 0, 0, 0), None); + assert_eq!( + infer_snp_genotype('A', 'G', 9, 1, 10).as_deref(), + Some("AA") + ); + assert_eq!( + infer_snp_genotype('A', 'G', 1, 9, 10).as_deref(), + Some("GG") + ); + assert_eq!( + infer_snp_genotype('A', 'G', 5, 5, 10).as_deref(), + Some("AG") + ); + assert!(describe_snp_decision_rule('A', 'G', 0, 0, 0).contains("no covering reads")); + assert!(describe_snp_decision_rule('A', 'G', 0, 0, 3).contains("no reads matched")); + 
assert!(describe_snp_decision_rule('A', 'G', 2, 8, 10).contains("alt_fraction=0.800")); + + assert_eq!(infer_copy_number_genotype("I", "D", 0, 0, 0), None); + assert_eq!( + infer_copy_number_genotype("I", "D", 9, 1, 10).as_deref(), + Some("II") + ); + assert_eq!( + infer_copy_number_genotype("I", "D", 1, 9, 10).as_deref(), + Some("DD") + ); + assert_eq!( + infer_copy_number_genotype("I", "D", 5, 5, 10).as_deref(), + Some("ID") + ); + assert!( + describe_copy_number_decision_rule("I", "D", 0, 0, 0).contains("no covering reads") + ); + + assert_eq!(chrom_sort_key("chr2"), "002"); + assert_eq!(chrom_sort_key("X"), "023"); + assert_eq!(chrom_sort_key("MT"), "025"); + assert_eq!(chrom_sort_key("GL0001"), "999-GL0001"); + assert_eq!(variant_sort_key(&variant).0, 0); + assert_eq!(describe_query(&variant), "variant_by_locus"); + assert_eq!(describe_query(&VariantSpec::default()), "variant_by_rsid"); + } + + #[test] + fn genotype_private_helpers_cover_row_parsing_and_normalization() { + assert!(matches!( + detect_delimiter(&["# skip".to_owned(), "a,b".to_owned()]), + Delimiter::Comma + )); + assert!(matches!( + detect_delimiter(&["a b".to_owned()]), + Delimiter::Space + )); + assert!(matches!(detect_delimiter(&Vec::new()), Delimiter::Tab)); + + assert_eq!(strip_bom("\u{feff}rs1"), "rs1"); + assert_eq!(normalize_name("Base Pair-Position"), "basepairposition"); + assert_eq!(strip_inline_comment("AG # note"), "AG"); + assert_eq!(strip_inline_comment("AG // note"), "AG"); + assert_eq!(normalize_genotype("n/a"), "--"); + assert_eq!(normalize_genotype("a / g"), "AG"); + assert_eq!(normalize_genotype("A/-"), "ID"); + assert_eq!(split_csv_line(r#"rs1,"1,2",AG"#), vec!["rs1", "1,2", "AG"]); + + let mut parser = RowParser::new(Delimiter::Comma); + assert!( + parser + .consume_record("# snpid,chr,pos,allele_a,allele_b") + .unwrap() + .is_none() + ); + let row = parser.consume_record("rs1,1,10,A,G").unwrap().unwrap(); + assert_eq!(row.rsid.as_deref(), Some("rs1")); + 
assert_eq!(row.chrom.as_deref(), Some("1")); + assert_eq!(row.position, Some(10)); + assert_eq!(row.genotype, "AG"); + let short_row = parser.consume_record("bad,row").unwrap().unwrap(); + assert_eq!(short_row.rsid.as_deref(), Some("bad")); + assert_eq!(short_row.genotype, "--"); + assert_eq!(parser.default_header(6).len(), 6); + + let mut indexes = None; + let mut comment_header = None; + assert!( + parse_streaming_row( + "// marker chromosome position result", + Delimiter::Space, + &mut indexes, + &mut comment_header + ) + .unwrap() + .is_none() + ); + let row = parse_streaming_row( + "rs2 chr2 20 ct", + Delimiter::Space, + &mut indexes, + &mut comment_header, + ) + .unwrap() + .unwrap(); + assert_eq!(row.rsid.as_deref(), Some("rs2")); + assert_eq!(row.genotype, "CT"); + + let header = vec![ + "marker".to_owned(), + "chrom".to_owned(), + "base_pair_position".to_owned(), + "allele1".to_owned(), + "allele2".to_owned(), + ]; + let cols = build_column_indexes(&header); + assert_eq!(cols.rsid, Some(0)); + assert_eq!(cols.chrom, Some(1)); + assert_eq!(cols.position, Some(2)); + assert_eq!(cols.allele1, Some(3)); + assert_eq!(cols.allele2, Some(4)); + assert_eq!(default_column_indexes(2).position, None); + assert_eq!(find_header_index(&header, GENOTYPE_ALIASES), None); + assert!(looks_like_header_fields(&["rsid".to_owned()])); + assert!(!looks_like_header_fields(&["sample".to_owned()])); + } + + #[test] + fn genotype_private_helpers_cover_vcf_parsing_and_matching() { + assert!(parse_vcf_record("").unwrap().is_none()); + assert!(parse_vcf_record("#CHROM\tPOS").unwrap().is_none()); + assert!(parse_vcf_record("1\t10\trs1").unwrap().is_none()); + assert!( + parse_vcf_record("1\t10\trs1\t.\tG\t.\tPASS\t.\tGT\t0/1") + .unwrap() + .is_none() + ); + assert!( + parse_vcf_record("1\t10\trs1\tA\t.\t.\tPASS\t.\tGT\t0/1") + .unwrap() + .is_none() + ); + assert!(parse_vcf_record("1\tbad\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1").is_err()); + + let row = 
parse_vcf_record("chr1\t10\trs1\tA\tG,T\t.\tPASS\t.\tDP:GT\t8:1|2") + .unwrap() + .unwrap(); + assert_eq!(row.rsid.as_deref(), Some("rs1")); + assert_eq!(row.genotype, "GT"); + assert_eq!( + extract_vcf_sample_genotype("DP:AD", "8:1,2", "A", &["G".to_owned()]), + None + ); + assert_eq!( + genotype_from_vcf_gt(".", "A", &["G"]).as_deref(), + Some("--") + ); + assert_eq!( + genotype_from_vcf_gt("./1", "A", &["G"]).as_deref(), + Some("--") + ); + assert_eq!( + genotype_from_vcf_gt("bad", "A", &["G"]).as_deref(), + Some("--") + ); + assert_eq!(genotype_from_vcf_gt("2/2", "A", &["G"]), None); + assert_eq!(vcf_reference_token("AT", &["A"]), "I"); + assert_eq!(vcf_reference_token("A", &["AT"]), "D"); + assert_eq!(vcf_reference_token("A", &[""]), "A"); + assert_eq!(vcf_alt_token("AT", "A"), "D"); + assert_eq!(vcf_alt_token("A", "AT"), "I"); + assert_eq!(vcf_alt_token("A", ""), "--"); + assert!(is_symbolic_vcf_alt("")); + assert_eq!(normalize_sequence_token(" ag "), "AG"); + + assert_eq!( + detect_vcf_assembly(Path::new("sample.vcf"), &["##reference=hg19".to_owned()]), + Some(Assembly::Grch37) + ); + assert_eq!( + detect_vcf_assembly(Path::new("sample.vcf"), &["##assembly=GRCh38".to_owned()]), + Some(Assembly::Grch38) + ); + assert_eq!( + detect_vcf_assembly(Path::new("sample.b37.vcf"), &[]), + Some(Assembly::Grch37) + ); + assert_eq!( + detect_vcf_assembly(Path::new("sample.b38.vcf"), &[]), + Some(Assembly::Grch38) + ); + assert_eq!(normalize_chromosome_name("chrX"), "x"); + + let snp = VariantSpec { + grch38: Some(locus("1", 10, 10)), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }; + assert!(vcf_row_matches_variant(&row, &snp, Some(Assembly::Grch38))); + let deletion_row = parse_vcf_record("1\t9\trsdel\tATC\tA\t.\tPASS\t.\tGT\t0/1") + .unwrap() + .unwrap(); + let deletion = VariantSpec { + grch38: Some(locus("1", 10, 12)), + kind: Some(VariantKind::Deletion), + deletion_length: Some(2), 
+ ..VariantSpec::default() + }; + assert!(vcf_row_matches_variant( + &deletion_row, + &deletion, + Some(Assembly::Grch38) + )); + let insertion = VariantSpec { + grch38: Some(locus("1", 10, 10)), + kind: Some(VariantKind::Insertion), + ..VariantSpec::default() + }; + assert!(vcf_row_matches_variant( + &deletion_row, + &insertion, + Some(Assembly::Grch38) + )); + assert!(!vcf_row_matches_variant( + &row, + &VariantSpec::default(), + None + )); + } + + #[test] + fn genotype_private_helpers_cover_indel_record_classification() { + let record = AlignmentRecord { + start: 10, + end: 20, + is_unmapped: false, + cigar: vec![ + AlignmentOp { + kind: AlignmentOpKind::Match, + len: 2, + }, + AlignmentOp { + kind: AlignmentOpKind::Insertion, + len: 3, + }, + AlignmentOp { + kind: AlignmentOpKind::Deletion, + len: 2, + }, + AlignmentOp { + kind: AlignmentOpKind::SoftClip, + len: 4, + }, + ], + }; + assert!(spans_position(&record, 9)); + assert!(record_overlaps_locus(&record, &locus("1", 15, 16))); + assert_eq!( + indel_at_anchor(&record, 11), + Some((AlignmentOpKind::Insertion, 3)) + ); + let deletion_record = AlignmentRecord { + start: 10, + end: 20, + is_unmapped: false, + cigar: vec![ + AlignmentOp { + kind: AlignmentOpKind::Match, + len: 3, + }, + AlignmentOp { + kind: AlignmentOpKind::Deletion, + len: 2, + }, + ], + }; + assert_eq!( + indel_at_anchor(&deletion_record, 12), + Some((AlignmentOpKind::Deletion, 2)) + ); + assert_eq!(indel_at_anchor(&record, 30), None); + assert_eq!(len_as_i64(usize::MAX), None); + + let insertion = classify_expected_indel(&record, &locus("1", 12, 12), 1, "ATGC").unwrap(); + assert!(insertion.covering); + assert!(insertion.matches_alt); + assert_eq!(insertion.observed_len, 4); + let deletion = + classify_expected_indel(&deletion_record, &locus("1", 13, 13), 3, "A").unwrap(); + assert!(deletion.matches_alt); + assert_eq!(deletion.observed_len, 1); + let reference_like = + classify_expected_indel(&record, &locus("1", 18, 18), 1, "AT").unwrap(); + 
assert!(reference_like.reference_like); + let not_covering = classify_expected_indel(&record, &locus("1", 1, 2), 2, "A").unwrap(); + assert!(!not_covering.covering); + + assert_eq!(normalize_pileup_base(b'a'), Some('A')); + assert_eq!(normalize_pileup_base(b'n'), None); + let pileup = SnpPileupCounts { + filtered_depth: 2, + filtered_ref_count: 1, + filtered_alt_count: 1, + raw_depth: 3, + raw_ref_count: 2, + raw_alt_count: 1, + filtered_low_base_quality: 1, + filtered_non_acgt: 1, + ..SnpPileupCounts::default() + }; + let evidence = pileup.evidence_lines("1:10-10", 10); + assert_eq!(evidence.len(), 4); + assert!(evidence[0].contains("filtered_depth=2")); + } + + #[test] + fn genotype_private_helpers_cover_file_and_zip_scanning_paths() { + let dir = temp_dir("file-zip-scanning"); + let text = dir.join("sample.txt"); + fs::write( + &text, + "# rsid chromosome position genotype\n\ + rs1 1 10 AG\n\ + rs2 2 20 CT\n", + ) + .unwrap(); + assert!(matches!( + detect_source_format(&text, None).unwrap(), + GenotypeSourceFormat::Text + )); + assert!(matches!( + detect_source_format(&text, Some(GenotypeSourceFormat::Cram)).unwrap(), + GenotypeSourceFormat::Cram + )); + assert!(!looks_like_vcf_lines(&["rsid\tgenotype".to_owned()])); + + let backend = DelimitedBackend { + format: GenotypeSourceFormat::Text, + path: text.clone(), + zip_entry_name: None, + }; + let variants = vec![ + VariantSpec { + rsids: vec!["rs2".to_owned()], + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(locus("1", 10, 10)), + ..VariantSpec::default() + }, + VariantSpec { + rsids: vec!["missing".to_owned()], + ..VariantSpec::default() + }, + ]; + let results = scan_delimited_variants(&backend, &variants).unwrap(); + assert_eq!(results[0].genotype.as_deref(), Some("CT")); + assert_eq!(results[1].genotype.as_deref(), Some("AG")); + assert!(results[2].evidence[0].contains("no matching rsid")); + assert_eq!(backend.get("rs1").unwrap().as_deref(), Some("AG")); + assert_eq!( + backend + 
.lookup_variant(&VariantSpec { + rsids: vec!["rs2".to_owned()], + ..VariantSpec::default() + }) + .unwrap() + .genotype + .as_deref(), + Some("CT") + ); + + let zip_path = dir.join("sample.zip"); + let cursor = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .add_directory("nested/", SimpleFileOptions::default()) + .unwrap(); + writer + .start_file("nested/sample.csv", SimpleFileOptions::default()) + .unwrap(); + writer + .write_all(b"rsid,chromosome,position,genotype\nrs3,3,30,GG\n") + .unwrap(); + let bytes = writer.finish().unwrap().into_inner(); + fs::write(&zip_path, bytes).unwrap(); + assert_eq!(select_zip_entry(&zip_path).unwrap(), "nested/sample.csv"); + let zip_backend = GenotypeStore::from_file(&zip_path).unwrap(); + assert_eq!(zip_backend.get("rs3").unwrap().as_deref(), Some("GG")); + + let unsupported_backend = DelimitedBackend { + format: GenotypeSourceFormat::Vcf, + path: text, + zip_entry_name: None, + }; + let err = scan_delimited_variants(&unsupported_backend, &variants).unwrap_err(); + assert!( + err.to_string() + .contains("streaming delimited backend only supports") + ); + } + + #[test] + fn zip_entry_limited_reader_rejects_oversized_output() { + let mut reader = std::io::Cursor::new(b"abcdef".to_vec()); + let err = read_zip_entry_limited(&mut reader, 5, "test zip entry").unwrap_err(); + assert!( + err.to_string() + .contains("test zip entry exceeds decompressed limit of 5 bytes"), + "{err}" + ); + } +} diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index 42739aa..93dc0a0 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -37,6 +37,8 @@ use bioscript_core::{Assembly, RuntimeError}; use noodles::bgzf; use zip::ZipArchive; +const MAX_ZIP_SAMPLE_ENTRY_BYTES: u64 = 128 * 1024 * 1024; + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum FileContainer { Plain, @@ -277,12 +279,11 @@ fn 
read_zip_sample_lines_from_bytes( )) })?; if selected_entry.to_ascii_lowercase().ends_with(".vcf.gz") { - let mut inner = Vec::new(); - entry.read_to_end(&mut inner).map_err(|err| { - RuntimeError::Io(format!( - "failed to read compressed zip entry {selected_entry}: {err}" - )) - })?; + let inner = read_entry_limited( + &mut entry, + MAX_ZIP_SAMPLE_ENTRY_BYTES, + &format!("compressed zip entry {selected_entry}"), + )?; let reader = bgzf::io::Reader::new(Cursor::new(inner)); return read_sample_lines_from_reader(BufReader::new(reader)); } @@ -477,13 +478,14 @@ fn read_zip_sample_lines(path: &Path, selected_entry: &str) -> Result Result( + reader: &mut R, + max_bytes: u64, + label: &str, +) -> Result, RuntimeError> { + let mut bytes = Vec::new(); + reader + .take(max_bytes.saturating_add(1)) + .read_to_end(&mut bytes) + .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; + if u64::try_from(bytes.len()).unwrap_or(u64::MAX) > max_bytes { + return Err(RuntimeError::InvalidArguments(format!( + "{label} exceeds decompressed limit of {max_bytes} bytes" + ))); + } + Ok(bytes) +} + fn read_sample_lines_from_reader(mut reader: R) -> Result, RuntimeError> { let mut out = Vec::new(); let mut buf = String::new(); @@ -939,3 +959,107 @@ fn render_bool(value: Option) -> &'static str { None => "", } } + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + #[test] + fn inspect_zip_entry_limited_reader_rejects_oversized_output() { + let mut reader = Cursor::new(b"abcdef".to_vec()); + let err = read_entry_limited(&mut reader, 5, "inspect zip entry").unwrap_err(); + assert!( + err.to_string() + .contains("inspect zip entry exceeds decompressed limit of 5 bytes"), + "{err}" + ); + } + + #[test] + fn inspect_helpers_cover_text_shape_source_and_assembly_edges() { + assert_eq!(split_fields("rs1\t1\t2\tAA"), vec!["rs1", "1", "2", "AA"]); + assert_eq!(split_fields("\"rs1\", 1, 2, \"AG\""), vec!["rs1", "1", "2", "AG"]); + 
assert!(looks_like_genotype_text(&[ + "// header".to_owned(), + "i12345 XY 10 A G".to_owned(), + "rs2 chr26 20 DD".to_owned(), + ])); + assert!(!looks_like_genotype_text(&["not enough fields".to_owned()])); + assert!(!matches_genotype_shape(&["bad".to_owned(), "1".to_owned(), "2".to_owned(), "AA".to_owned()])); + assert!(!matches_genotype_shape(&["rs1".to_owned(), "badchr".to_owned(), "2".to_owned(), "AA".to_owned()])); + assert!(!is_valid_genotype("")); + assert!(!is_valid_genotype("ACGTI")); + assert!(!is_valid_allele("N")); + + let gfg = detect_source( + "genesforgood.txt", + &["# Genes for Good v1 export".to_owned()], + DetectedKind::GenotypeText, + ) + .unwrap(); + assert_eq!(gfg.vendor.as_deref(), Some("Genes for Good")); + assert_eq!(gfg.platform_version.as_deref(), Some("v1")); + + let twenty_three = detect_source("/tmp/v5/23andme.txt", &[], DetectedKind::GenotypeText).unwrap(); + assert_eq!(twenty_three.vendor.as_deref(), Some("23andMe")); + assert_eq!(twenty_three.platform_version.as_deref(), Some("v5")); + assert_eq!( + detect_source("sequencing.com.vcf", &["##source=sequencing.com".to_owned()], DetectedKind::Vcf) + .unwrap() + .confidence, + DetectionConfidence::WeakHeuristic + ); + assert_eq!( + detect_source("cari-genetics.txt", &[], DetectedKind::GenotypeText) + .unwrap() + .vendor + .as_deref(), + Some("CariGenetics") + ); + assert_eq!(canonicalize_ancestry_version("v2.0"), "V2.0"); + + assert_eq!( + detect_assembly("sample", &["##reference=human_g1k_v37".to_owned()]), + Some(Assembly::Grch37) + ); + assert_eq!( + detect_assembly("sample", &["##contig=".to_owned()]), + Some(Assembly::Grch38) + ); + assert_eq!(detect_assembly("sample", &[]), None); + } + + #[test] + fn inspect_helpers_cover_index_and_render_edges() { + let explicit = PathBuf::from("/tmp/explicit.idx"); + let options = InspectOptions { + input_index: Some(explicit.clone()), + ..InspectOptions::default() + }; + assert_eq!( + detect_index(Path::new("sample.txt"), 
DetectedKind::GenotypeText, &options), + (Some(false), Some(explicit)) + ); + + let no_ext_ref = Path::new("reference"); + assert_eq!( + detect_index(no_ext_ref, DetectedKind::ReferenceFasta, &InspectOptions::default()).1, + Some(PathBuf::from("reference.fai")) + ); + assert_eq!( + detect_index(Path::new("sample.dat"), DetectedKind::AlignmentCram, &InspectOptions::default()), + (Some(false), None) + ); + + assert_eq!(render_container(FileContainer::Plain), "plain"); + assert_eq!(render_container(FileContainer::Zip), "zip"); + assert_eq!(render_kind(DetectedKind::AlignmentBam), "alignment_bam"); + assert_eq!(render_kind(DetectedKind::Unknown), "unknown"); + assert_eq!(render_confidence(DetectionConfidence::Unknown), "unknown"); + assert_eq!(render_assembly(None), ""); + assert_eq!(render_bool(Some(true)), "true"); + assert_eq!(render_bool(Some(false)), "false"); + assert_eq!(render_bool(None), ""); + } +} diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index 30949fe..ae050f8 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -2,6 +2,7 @@ clippy::case_sensitive_file_extension_comparisons, clippy::missing_errors_doc, clippy::must_use_candidate, + clippy::too_many_arguments, clippy::too_many_lines, clippy::unnecessary_wraps, clippy::unused_self diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index 05d8702..be6c90f 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -629,6 +629,7 @@ fn forced_cram_store(dir: &std::path::Path, reference_name: &str) -> GenotypeSto reference_file: Some(dir.join(reference_name)), reference_index: Some(dir.join(format!("{reference_name}.fai"))), input_index: Some(dir.join("missing.cram.crai")), + allow_reference_md5_mismatch: false, }, ) .unwrap() @@ -1212,6 +1213,13 @@ fn chr_y_cram_fixture_or_skip(test_name: &str) -> Option { } fn open_cram_store(fx: 
&CramFixture) -> GenotypeStore { + open_cram_store_with_md5_policy(fx, false) +} + +fn open_cram_store_with_md5_policy( + fx: &CramFixture, + allow_reference_md5_mismatch: bool, +) -> GenotypeStore { GenotypeStore::from_file_with_options( &fx.cram, &GenotypeLoadOptions { @@ -1219,6 +1227,7 @@ fn open_cram_store(fx: &CramFixture) -> GenotypeStore { input_index: Some(fx.input_index.clone()), reference_file: Some(fx.reference.clone()), reference_index: Some(fx.reference_index.clone()), + allow_reference_md5_mismatch, }, ) .expect("open cram store") @@ -1293,13 +1302,13 @@ fn cram_mini_fixture_streams_only_locus_overlapping_reads() { } #[test] -fn cram_mini_fixture_md5_mismatch_is_tolerated() { +fn cram_mini_fixture_md5_mismatch_is_tolerated_when_allowed() { // mini_bad_ref.fa has a single-base mutation at chr_test:2800, inside the // slice span but far from our query locus at 1000. noodles' strict MD5 // check will fail; bioscript must warn + retry unchecked + still return // the correct genotype (the bases at pos 1000 are identical in both refs). 
let fx = mini_cram_fixture_with_bad_ref(); - let store = open_cram_store(&fx); + let store = open_cram_store_with_md5_policy(&fx, true); let observation = store .lookup_variant(&VariantSpec { @@ -1432,7 +1441,7 @@ fn cram_md5_mismatch_is_tolerated_and_returns_correct_result() { else { return; }; - let store = open_cram_store(&fx); + let store = open_cram_store_with_md5_policy(&fx, true); let observation = store .lookup_variant(&VariantSpec { @@ -1483,7 +1492,7 @@ fn cram_rs9357296_reports_heterozygous_counts_for_na06985() { else { return; }; - let store = open_cram_store(&fx); + let store = open_cram_store_with_md5_policy(&fx, true); let observation = store .lookup_variant(&VariantSpec { diff --git a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index 667e67c..2c33a43 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -1,6 +1,7 @@ use std::{ collections::{BTreeMap, HashMap}, fs, + io::Read, path::{Component, Path, PathBuf}, sync::{ Arc, Mutex, @@ -22,6 +23,8 @@ type HostFunction = fn( &[(MontyObject, MontyObject)], ) -> Result; +const MAX_HOST_TEXT_BYTES: u64 = 16 * 1024 * 1024; + #[derive(Debug, Clone)] pub struct RuntimeConfig { pub limits: ResourceLimits, @@ -299,8 +302,11 @@ impl BioscriptRuntime { "bioscript.load_genotypes expects self and path".to_owned(), )); } - let path = - self.resolve_user_path(&expect_string_arg(args, 1, "bioscript.load_genotypes")?)?; + let path = self.resolve_existing_user_path(&expect_string_arg( + args, + 1, + "bioscript.load_genotypes", + )?)?; let loader = self.resolved_loader_options()?; let store = GenotypeStore::from_file_with_options(&path, &loader)?; let handle = self.state.next_handle(); @@ -543,7 +549,8 @@ impl BioscriptRuntime { "bioscript.write_tsv expects self, path, rows".to_owned(), )); } - let path = self.resolve_user_path(&expect_string_arg(args, 1, "bioscript.write_tsv")?)?; + let path = + 
self.resolve_user_write_path(&expect_string_arg(args, 1, "bioscript.write_tsv")?)?; let rows = expect_rows(&args[2])?; if let Some(parent) = path.parent() { fs::create_dir_all(parent).map_err(|err| { @@ -636,6 +643,47 @@ impl BioscriptRuntime { Ok(self.root.join(path)) } + fn resolve_existing_user_path(&self, raw_path: &str) -> Result { + let path = self.resolve_user_path(raw_path)?; + let canonical = path.canonicalize().map_err(|err| { + RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) + })?; + self.ensure_under_root(&canonical, raw_path)?; + Ok(canonical) + } + + fn resolve_user_write_path(&self, raw_path: &str) -> Result { + let path = self.resolve_user_path(raw_path)?; + if path.exists() { + let canonical = path.canonicalize().map_err(|err| { + RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) + })?; + self.ensure_under_root(&canonical, raw_path)?; + return Ok(canonical); + } + + let parent = path.parent().unwrap_or(&self.root); + let existing_parent = deepest_existing_ancestor(parent); + let canonical_parent = existing_parent.canonicalize().map_err(|err| { + RuntimeError::Io(format!( + "failed to resolve parent dir {}: {err}", + existing_parent.display() + )) + })?; + self.ensure_under_root(&canonical_parent, raw_path)?; + Ok(path) + } + + fn ensure_under_root(&self, path: &Path, raw_path: &str) -> Result<(), RuntimeError> { + if path.starts_with(&self.root) { + Ok(()) + } else { + Err(RuntimeError::InvalidArguments(format!( + "path escapes bioscript root: {raw_path}" + ))) + } + } + fn write_trace_report( &self, report_path: &Path, @@ -1310,9 +1358,8 @@ fn host_read_text( kwargs: &[(MontyObject, MontyObject)], ) -> Result { reject_kwargs(kwargs, "read_text")?; - let path = runtime.resolve_user_path(&expect_string_arg(args, 0, "read_text")?)?; - let content = fs::read_to_string(&path) - .map_err(|err| RuntimeError::Io(format!("failed to read {}: {err}", path.display())))?; + let path = 
runtime.resolve_existing_user_path(&expect_string_arg(args, 0, "read_text")?)?; + let content = read_text_limited(&path, MAX_HOST_TEXT_BYTES)?; Ok(MontyObject::String(content)) } @@ -1322,8 +1369,13 @@ fn host_write_text( kwargs: &[(MontyObject, MontyObject)], ) -> Result { reject_kwargs(kwargs, "write_text")?; - let path = runtime.resolve_user_path(&expect_string_arg(args, 0, "write_text")?)?; + let path = runtime.resolve_user_write_path(&expect_string_arg(args, 0, "write_text")?)?; let content = expect_string_arg(args, 1, "write_text")?; + if u64::try_from(content.len()).unwrap_or(u64::MAX) > MAX_HOST_TEXT_BYTES { + return Err(RuntimeError::InvalidArguments(format!( + "write_text content exceeds {MAX_HOST_TEXT_BYTES} bytes" + ))); + } if let Some(parent) = path.parent() { fs::create_dir_all(parent).map_err(|err| { RuntimeError::Io(format!( @@ -1337,6 +1389,40 @@ fn host_write_text( Ok(MontyObject::None) } +fn deepest_existing_ancestor(path: &Path) -> &Path { + let mut current = path; + while !current.exists() { + let Some(parent) = current.parent() else { + break; + }; + current = parent; + } + current +} + +fn read_text_limited(path: &Path, max_bytes: u64) -> Result { + let mut file = fs::File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to read {}: {err}", path.display())))?; + let mut bytes = Vec::new(); + file.by_ref() + .take(max_bytes.saturating_add(1)) + .read_to_end(&mut bytes) + .map_err(|err| RuntimeError::Io(format!("failed to read {}: {err}", path.display())))?; + if u64::try_from(bytes.len()).unwrap_or(u64::MAX) > max_bytes { + return Err(RuntimeError::InvalidArguments(format!( + "read_text input {} exceeds {} bytes", + path.display(), + max_bytes + ))); + } + String::from_utf8(bytes).map_err(|err| { + RuntimeError::Io(format!( + "failed to decode {} as UTF-8: {err}", + path.display() + )) + }) +} + fn host_trace( runtime: &BioscriptRuntime, args: &[MontyObject], @@ -1451,3 +1537,118 @@ fn update_nesting_depth(mut depth: usize, line: 
&str) -> usize { depth } + +#[cfg(test)] +mod tests { + use super::*; + + fn attr<'a>(obj: &'a MontyObject, name: &str) -> Option<&'a MontyObject> { + let MontyObject::Dataclass { attrs, .. } = obj else { + return None; + }; + attrs.into_iter().find_map(|(key, value)| { + matches!(key, MontyObject::String(text) if text == name).then_some(value) + }) + } + + #[test] + fn trace_helpers_cover_coordinates_rsids_and_statement_edges() { + assert_eq!( + trace_lookup_metadata("bioscript.variant(rsid='rs12345')").0, + Some("rs12345".to_owned()) + ); + let (key, url) = trace_lookup_metadata("bioscript.variant(grch37='chr1:10-20')"); + assert_eq!(key.as_deref(), Some("1:10-20")); + assert!(url.unwrap().starts_with("https://grch37.ensembl.org")); + let (key, _) = trace_lookup_metadata("bioscript.variant(grch38='2:30')"); + assert_eq!(key.as_deref(), Some("2:30-30")); + assert_eq!(trace_lookup_metadata("no lookup here"), (None, None)); + + let lines = ["plan = bioscript.query_plan([", " RS1,", "])"]; + assert_eq!( + statement_context(&lines, 1), + "plan = bioscript.query_plan([ RS1, ])" + ); + assert_eq!(statement_context(&lines, 0), ""); + assert_eq!(statement_context(&lines, 9), ""); + + assert_eq!(extract_rsid("x rs42 y"), Some("rs42".to_owned())); + assert_eq!(extract_rsid("notrs42"), None); + assert_eq!(extract_coordinate("chrX:7;"), Some("X:7-7".to_owned())); + assert_eq!(extract_coordinate("chrM:7-8"), Some("M:7-8".to_owned())); + assert_eq!(extract_coordinate("chr1:x-y"), None); + } + + #[test] + fn instrument_source_tracks_continuations_comments_and_strings() { + let source = "value = (\n 'not ) counted'\n)\n# skip\nnext_value = 1\\\n + 2\n"; + let instrumented = instrument_source(source); + + assert!(instrumented.contains("__bioscript_trace__(1)\nvalue = (")); + assert!(!instrumented.contains("__bioscript_trace__(2)")); + assert!(!instrumented.contains("__bioscript_trace__(3)")); + assert!(!instrumented.contains("__bioscript_trace__(4)")); + 
assert!(instrumented.contains("__bioscript_trace__(5)\nnext_value = 1\\")); + assert!(!instrumented.contains("__bioscript_trace__(6)")); + assert!(instrumented.ends_with('\n')); + + assert!(ends_with_unescaped_backslash("x = 1\\")); + assert!(!ends_with_unescaped_backslash("x = '\\\\'")); + assert_eq!(update_nesting_depth(0, "call(') still string', [1]) # ignored"), 0); + assert_eq!(update_nesting_depth(0, "call(["), 2); + assert_eq!(update_nesting_depth(2, "])"), 0); + } + + #[test] + fn object_helpers_cover_optional_fields_and_conversion_errors() { + let observation = bioscript_core::VariantObservation { + backend: "vcf".to_owned(), + matched_rsid: Some("rs1".to_owned()), + assembly: Some(bioscript_core::Assembly::Grch37), + genotype: Some("AG".to_owned()), + ref_count: Some(3), + alt_count: Some(2), + depth: Some(5), + raw_counts: BTreeMap::from([("A".to_owned(), 3), ("G".to_owned(), 2)]), + decision: Some("heterozygous".to_owned()), + evidence: vec!["resolved".to_owned()], + }; + let object = variant_observation_object(&observation); + assert!(matches!(attr(&object, "assembly"), Some(MontyObject::String(v)) if v == "grch37")); + assert!(matches!(attr(&object, "ref_count"), Some(MontyObject::Int(3)))); + assert!(matches!(attr(&object, "alt_count"), Some(MontyObject::Int(2)))); + assert!(matches!(attr(&object, "depth"), Some(MontyObject::Int(5)))); + + let missing = variant_observation_object(&bioscript_core::VariantObservation::default()); + assert!(matches!(attr(&missing, "assembly"), Some(MontyObject::None))); + assert!(matches!(attr(&missing, "genotype"), Some(MontyObject::None))); + + assert_eq!(string_or_list(&MontyObject::None).unwrap(), Vec::::new()); + assert_eq!( + string_list_from_object(&MontyObject::None).unwrap(), + Vec::::new() + ); + assert_eq!(string_from_optional(&MontyObject::None).unwrap(), None); + assert_eq!(int_from_optional(&MontyObject::None).unwrap(), None); + 
assert!(string_list_from_object(&MontyObject::String("x".to_owned())).is_err()); + + let bad_plan = MontyObject::Dataclass { + name: "VariantPlan".to_owned(), + type_id: 4, + field_names: vec![], + attrs: vec![].into(), + frozen: true, + }; + assert!(variant_specs_from_plan(&bad_plan).unwrap_err().to_string().contains("missing variants")); + + let bad_variant = MontyObject::Dataclass { + name: "Other".to_owned(), + type_id: 9, + field_names: vec![], + attrs: vec![].into(), + frozen: true, + }; + assert!(dataclass_to_variant_spec(&bad_variant).unwrap_err().to_string().contains("got Other")); + assert!(dataclass_to_variant_spec(&MontyObject::None).unwrap_err().to_string().contains("expected Variant object")); + } +} diff --git a/rust/bioscript-runtime/tests/security.rs b/rust/bioscript-runtime/tests/security.rs index b1ba2f3..afbaecd 100644 --- a/rust/bioscript-runtime/tests/security.rs +++ b/rust/bioscript-runtime/tests/security.rs @@ -1,12 +1,12 @@ use std::{ fs, path::PathBuf, - time::{SystemTime, UNIX_EPOCH}, + time::{Duration, SystemTime, UNIX_EPOCH}, }; use bioscript_formats::GenotypeLoadOptions; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; -use monty::MontyObject; +use monty::{MontyObject, ResourceLimits}; fn temp_dir(label: &str) -> PathBuf { let nanos = SystemTime::now() @@ -131,6 +131,142 @@ if __name__ == "__main__": assert_eq!(written, "hello nested output"); } +#[cfg(unix)] +#[test] +fn host_read_text_rejects_symlink_escape() { + let dir = temp_dir("read-symlink-escape"); + let outside = temp_dir("read-symlink-outside"); + fs::write(outside.join("secret.txt"), "secret").unwrap(); + std::os::unix::fs::symlink(outside.join("secret.txt"), dir.join("linked-secret.txt")).unwrap(); + + let Err(err) = run_script_with_inputs( + &dir, + r#" +def main(): + bioscript.read_text("linked-secret.txt") + +if __name__ == "__main__": + main() +"#, + Vec::new(), + ) else { + panic!("expected symlink read to fail"); + }; + assert!(err.contains("path escapes 
bioscript root"), "{err}"); +} + +#[cfg(unix)] +#[test] +fn host_write_text_rejects_symlink_escape() { + let dir = temp_dir("write-symlink-escape"); + let outside = temp_dir("write-symlink-outside"); + fs::write(outside.join("target.txt"), "before").unwrap(); + std::os::unix::fs::symlink(outside.join("target.txt"), dir.join("linked-target.txt")).unwrap(); + + let Err(err) = run_script_with_inputs( + &dir, + r#" +def main(): + bioscript.write_text("linked-target.txt", "after") + +if __name__ == "__main__": + main() +"#, + Vec::new(), + ) else { + panic!("expected symlink write to fail"); + }; + assert!(err.contains("path escapes bioscript root"), "{err}"); + assert_eq!( + fs::read_to_string(outside.join("target.txt")).unwrap(), + "before" + ); +} + +#[test] +fn host_read_text_rejects_oversized_file() { + let dir = temp_dir("oversized-read"); + fs::write(dir.join("large.txt"), vec![b'a'; 16 * 1024 * 1024 + 1]).unwrap(); + + let Err(err) = run_script_with_inputs( + &dir, + r#" +def main(): + bioscript.read_text("large.txt") + +if __name__ == "__main__": + main() +"#, + Vec::new(), + ) else { + panic!("expected oversized read to fail"); + }; + assert!(err.contains("exceeds 16777216 bytes"), "{err}"); +} + +#[test] +fn host_read_text_rejects_invalid_utf8() { + let dir = temp_dir("invalid-utf8-read"); + fs::write(dir.join("invalid.txt"), [0xff, 0xfe, b'a']).unwrap(); + + let Err(err) = run_script_with_inputs( + &dir, + r#" +def main(): + bioscript.read_text("invalid.txt") + +if __name__ == "__main__": + main() +"#, + Vec::new(), + ) else { + panic!("expected invalid UTF-8 read to fail"); + }; + assert!(err.contains("failed to decode"), "{err}"); +} + +#[test] +fn host_write_text_rejects_oversized_content() { + let dir = temp_dir("oversized-write"); + let script = dir.join("script.py"); + fs::write( + &script, + r#" +def main(): + bioscript.write_text("large.txt", content) + +if __name__ == "__main__": + main() +"#, + ) + .unwrap(); + let huge = "a".repeat(16 * 1024 * 
1024 + 1); + let runtime = BioscriptRuntime::with_config( + &dir, + RuntimeConfig { + limits: ResourceLimits::new() + .max_duration(Duration::from_millis(100)) + .max_memory(64 * 1024 * 1024) + .max_allocations(200_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)), + ..RuntimeConfig::default() + }, + ) + .unwrap(); + + let Err(err) = runtime.run_file(&script, None, vec![("content", MontyObject::String(huge))]) + else { + panic!("expected oversized write to fail"); + }; + let err = err.to_string(); + assert!( + err.contains("write_text content exceeds 16777216 bytes"), + "{err}" + ); + assert!(!dir.join("large.txt").exists()); +} + #[test] fn runtime_lookup_details_reports_missing_variant_and_no_call() { let dir = temp_dir("lookup-missing-no-call"); @@ -572,6 +708,7 @@ fn runtime_loader_paths_are_resolved_and_escape_checks_apply() { input_index: Some(PathBuf::from("indexes/input.crai")), reference_file: Some(PathBuf::from("refs/ref.fa")), reference_index: Some(PathBuf::from("refs/ref.fa.fai")), + ..GenotypeLoadOptions::default() }, ..RuntimeConfig::default() }, diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index f6530f0..f645546 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -780,3 +780,221 @@ fn remote_resource_resolution_handles_json_versions_and_plain_relative_urls() { "{err}" ); } + +#[test] +fn validate_variants_covers_remaining_identity_coordinate_and_allele_edges() { + let dir = temp_dir("validate-variant-more-edges"); + fs::write( + dir.join("not-a-variant.yaml"), + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "panel-shape" +members: [] +"#, + ) + .unwrap(); + fs::write( + dir.join("many-errors.yaml"), + r#" +schema: "bioscript:variant:1.0" +tags: + - 7 + - "" +identifiers: + aliases: + - 7 + - "bad-alias" + - "rs22" + - "rs22" +coordinates: + grch37: + pos: 12 + grch38: + chrom: "2" +alleles: + 
kind: "snv" +provenance: + sources: + - kind: "database" +"#, + ) + .unwrap(); + fs::write( + dir.join("range-and-alleles.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "range-and-alleles" +coordinates: + grch37: + chrom: "3" + start: 0 + end: 0 + grch38: + chrom: "4" + start: "bad" + end: 9 +alleles: + kind: "deletion" + ref: "A" + alts: "T" +"#, + ) + .unwrap(); + fs::write( + dir.join("empty-alts.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "empty-alts" +coordinates: + grch38: + chrom: "5" + pos: 11 +alleles: + kind: "insertion" + ref: "A" + alts: [] +"#, + ) + .unwrap(); + + let report = validate_variants_path(&dir).unwrap(); + let text = report.render_text(); + + assert_eq!(report.files_scanned, 4); + assert!(report.total_issues() >= 19, "{text}"); + for expected in [ + "missing required field", + "tags[0]: expected string", + "tags[1]: empty tag string", + "identifiers.aliases[0]: expected string", + "expected rsid like rs123, found 'bad-alias'", + "duplicate identifier 'rs22'", + "coordinates.grch37.chrom: missing chrom", + "coordinates.grch38: expected either pos or start/end", + "coordinates.grch37.start: expected integer >= 1", + "coordinates.grch37.end: expected integer >= 1", + "coordinates.grch38: expected integer start/end", + "alleles.ref: missing required field", + "alleles.alts: expected a non-empty sequence of strings", + "alleles.alts: expected at least one alternate allele", + "provenance.sources[0].label: missing required field", + "provenance.sources[0].url: missing required field", + ] { + assert!(text.contains(expected), "{expected}\n{text}"); + } + assert!(!text.contains("panel-shape")); +} + +#[test] +fn validate_panels_and_loaders_cover_parse_error_edges() { + let dir = temp_dir("validate-panel-more-edges"); + let non_panel = dir.join("variant.yaml"); + fs::write( + &non_panel, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "rs1" +"#, + ) + .unwrap(); + let missing_schema = 
dir.join("missing-schema.yaml"); + fs::write( + &missing_schema, + r#" +version: "1.0" +name: "missing-schema" +"#, + ) + .unwrap(); + + let report = validate_panels_path(&dir).unwrap(); + let text = report.render_text(); + assert_eq!(report.files_scanned, 2); + assert_eq!(report.total_errors(), 1, "{text}"); + assert!(text.contains("missing schema")); + assert!(!text.contains("rs1")); + + let invalid_panel = dir.join("invalid-panel.yaml"); + fs::write( + &invalid_panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +downloads: + - id: "" + url: "http://" + sha256: "" +members: + - download: "" + - "not-a-map" +"#, + ) + .unwrap(); + let err = load_panel_manifest(&invalid_panel).unwrap_err(); + assert!(err.contains("name: missing required field"), "{err}"); + assert!(err.contains("downloads[0].id: empty string"), "{err}"); + assert!(err.contains("downloads[0].version: missing required field"), "{err}"); + assert!(err.contains("members[0].kind: missing required field"), "{err}"); + assert!(err.contains("members[0].download: empty string"), "{err}"); + assert!(err.contains("members[1]: expected mapping"), "{err}"); + + let downloads_not_mapping = dir.join("downloads-not-mapping.yaml"); + fs::write( + &downloads_not_mapping, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "bad-download" +downloads: + - "not-a-map" +members: + - kind: "variant" + path: "rs1.yaml" +"#, + ) + .unwrap(); + let err = load_panel_manifest(&downloads_not_mapping).unwrap_err(); + assert!(err.contains("downloads[0]: expected mapping"), "{err}"); + + let members_not_mapping = dir.join("members-not-mapping.yaml"); + fs::write( + &members_not_mapping, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "bad-member" +members: + - "not-a-map" +"#, + ) + .unwrap(); + let err = load_panel_manifest(&members_not_mapping).unwrap_err(); + assert!(err.contains("members[0]: expected mapping"), "{err}"); + + let invalid_lookup = load_variant_manifest_text_for_lookup( + 
"bad-lookup.yaml", + r#" +schema: "wrong" +version: "1.0" +name: "bad-lookup" +coordinates: + grch38: + chrom: "1" + pos: 0 +alleles: + kind: "snv" + ref: "A" + alts: ["G"] +"#, + ) + .unwrap_err(); + assert!(invalid_lookup.contains("schema: expected schema"), "{invalid_lookup}"); + assert!( + invalid_lookup.contains("coordinates.grch38.pos: expected integer >= 1"), + "{invalid_lookup}" + ); +} diff --git a/rust/bioscript-wasm/src/lib.rs b/rust/bioscript-wasm/src/lib.rs index 563c150..b2c5607 100644 --- a/rust/bioscript-wasm/src/lib.rs +++ b/rust/bioscript-wasm/src/lib.rs @@ -428,6 +428,27 @@ pub fn lookup_genotype_bytes_variants( serde_json::to_string(&rows).map_err(|err| JsError::new(&format!("encode results: {err}"))) } +#[wasm_bindgen(js_name = lookupGenotypeBytesRsids)] +pub fn lookup_genotype_bytes_rsids( + name: &str, + bytes: &[u8], + rsids_json: &str, +) -> Result { + let store = GenotypeStore::from_bytes(name, bytes) + .map_err(|err| JsError::new(&format!("load genotype bytes {name}: {err:?}")))?; + let rsids: Vec = serde_json::from_str(rsids_json) + .map_err(|err| JsError::new(&format!("parse rsidsJson: {err}")))?; + let values = rsids + .iter() + .map(|rsid| { + store + .get(rsid) + .map_err(|err| JsError::new(&format!("lookup genotype rsid {rsid}: {err:?}"))) + }) + .collect::, _>>()?; + serde_json::to_string(&values).map_err(|err| JsError::new(&format!("encode results: {err}"))) +} + fn ensure_single_base_variant(variant: &VariantInput) -> Result<(), JsError> { let kind = variant .kind From 5fb94859a1f0bee08fd1ae55b807e8d224038525 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 15:59:38 +1000 Subject: [PATCH 7/9] more coverage --- .github/workflows/coverage.yml | 2 +- coverage.sh | 6 +- rust/bioscript-cli/tests/cli.rs | 173 ++++++++ rust/bioscript-formats/src/alignment.rs | 65 ++- rust/bioscript-formats/src/genotype.rs | 395 ++++++++++++++++++ rust/bioscript-formats/src/inspect.rs | 56 ++- rust/bioscript-runtime/src/runtime.rs | 51 
++- .../tests/validate_variants.rs | 17 +- 8 files changed, 739 insertions(+), 26 deletions(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 61a5cee..826ccde 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - test: [file_formats, formats_lib, inspect, prepare, cli, schema, core, runtime_security, runtime_resources] + test: [file_formats, formats_lib, inspect, prepare, cli, schema, core, runtime_lib, runtime_security, runtime_resources] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/coverage.sh b/coverage.sh index 42046dc..89791f2 100755 --- a/coverage.sh +++ b/coverage.sh @@ -25,7 +25,7 @@ Usage: ./coverage.sh [--full-clean|-c] [--open] [--large] [--all-tests] [--no-li --no-lint Skip cargo fmt and clippy checks --focused-test Run one focused integration test target: file_formats, formats_lib, inspect, prepare, cli, schema, core, - runtime_security, or runtime_resources + runtime_lib, runtime_security, or runtime_resources Environment: AUTO_INSTALL_LLVM_COV=0 Do not auto-install cargo-llvm-cov @@ -178,6 +178,9 @@ if [[ -n "$FOCUSED_TEST" ]]; then core) env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --lib ;; + runtime_lib) + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --lib + ;; runtime_security) env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test security -- --nocapture --test-threads="$TEST_THREADS" ;; @@ -200,6 +203,7 @@ else env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --test cli -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-schema --test validate_variants -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --lib + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --lib env "${COV_ENV[@]}" cargo llvm-cov 
--no-report -p bioscript-runtime --test security -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test resources_coverage -- --nocapture --test-threads="$TEST_THREADS" fi diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index e855032..927882f 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -88,15 +88,44 @@ fn cli_rejects_missing_values_and_unexpected_arguments() { vec!["inspect", "bioscripts/hello-world.py", "extra"], "unexpected argument: extra", ), + ( + vec!["inspect", "--input-index"], + "--input-index requires a path", + ), + ( + vec!["inspect", "--reference-file"], + "--reference-file requires a path", + ), + ( + vec!["inspect", "--reference-index"], + "--reference-index requires a path", + ), (vec!["prepare", "--root"], "--root requires a directory"), + ( + vec!["prepare", "--input-file"], + "--input-file requires a path", + ), + ( + vec!["prepare", "--reference-file"], + "--reference-file requires a path", + ), + (vec!["prepare", "extra"], "unexpected argument: extra"), ( vec!["prepare", "--cache-dir"], "--cache-dir requires a path", ), + ( + vec!["validate-variants", "one.yaml", "two.yaml"], + "unexpected argument: two.yaml", + ), ( vec!["validate-variants", "--report"], "--report requires a path", ), + ( + vec!["validate-panels", "one.yaml", "two.yaml"], + "unexpected argument: two.yaml", + ), ( vec!["validate-panels", "--report"], "--report requires a path", @@ -109,6 +138,51 @@ fn cli_rejects_missing_values_and_unexpected_arguments() { } } +#[test] +fn cli_accepts_auto_format_and_explicit_loader_paths_for_script_runs() { + let root = repo_root(); + let dir = temp_dir("loader-args"); + fs::write( + dir.join("script.py"), + r#" +def main(): + print("loader args accepted") + +if __name__ == "__main__": + main() +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + 
.arg("--root") + .arg(&dir) + .arg("--input-format") + .arg("auto") + .arg("--input-index") + .arg("input.crai") + .arg("--reference-file") + .arg("ref.fa") + .arg("--reference-index") + .arg("ref.fa.fai") + .arg("--allow-md5-mismatch") + .arg(dir.join("script.py")) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + assert!( + String::from_utf8_lossy(&output.stdout).contains("loader args accepted"), + "stdout: {}", + String::from_utf8_lossy(&output.stdout) + ); +} + #[test] fn cli_rejects_invalid_numeric_limits_and_input_formats() { let root = repo_root(); @@ -433,6 +507,30 @@ fn prepare_subcommand_reports_reference_index_flags() { assert!(stdout.contains("cache")); } +#[test] +fn prepare_subcommand_reports_nothing_to_index_for_noop_auto_request() { + let root = repo_root(); + let dir = temp_dir("prepare-noop-cli"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("prepare") + .arg("--root") + .arg(&dir) + .arg("--input-format") + .arg("auto") + .output() + .unwrap(); + + assert!(output.status.success(), "stderr: {}", stderr_text(&output)); + assert!(String::from_utf8_lossy(&output.stdout).is_empty()); + assert!( + stderr_text(&output).contains("bioscript prepare: nothing to index"), + "{}", + stderr_text(&output) + ); +} + #[test] fn validate_variants_cli_returns_nonzero_and_writes_report() { let root = repo_root(); @@ -565,6 +663,46 @@ alleles: assert!(stdout.contains("AG")); } +#[test] +fn variant_manifest_requires_input_file() { + let root = repo_root(); + let dir = temp_dir("variant-manifest-missing-input"); + let manifest = dir.join("rs1.yaml"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs1" +identifiers: + rsids: + - "rs1" +coordinates: + grch38: + chrom: "1" + pos: 10 +alleles: + kind: "snv" + ref: "A" + alts: ["G"] +"#, + ) + .unwrap(); + + let output = 
Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg(&manifest) + .output() + .unwrap(); + + assert!(!output.status.success()); + assert!( + stderr_text(&output).contains("manifest execution requires --input-file"), + "{}", + stderr_text(&output) + ); +} + #[test] fn variant_manifest_writes_output_trace_and_participant_id() { let root = repo_root(); @@ -844,3 +982,38 @@ members: "{stderr}" ); } + +#[test] +fn panel_manifest_reports_non_variant_members_as_not_executable_yet() { + let root = repo_root(); + let dir = temp_dir("panel-nonvariant-member"); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "mixed-panel" +members: + - kind: "script" + path: "script.py" + version: "1.0" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&panel) + .output() + .unwrap(); + + assert!(!output.status.success()); + assert!( + stderr_text(&output).contains("unsupported member kind 'script'"), + "{}", + stderr_text(&output) + ); +} diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index 2b4901f..7185b04 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -727,7 +727,7 @@ fn map_op(op: sam::alignment::record::cigar::Op) -> AlignmentOp { #[cfg(test)] mod tests { use super::*; - use std::num::NonZero; + use std::{fs::File, num::NonZero, path::PathBuf}; use noodles::sam::{ self, @@ -756,6 +756,10 @@ mod tests { .build() } + fn mini_fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures") + } + #[test] fn alignment_helpers_cover_header_region_and_interval_logic() { let header = header(); @@ -916,4 +920,63 @@ mod tests { let err = std::io::Error::other("other decode error"); assert!(!is_reference_md5_mismatch(&err)); } + + #[test] + fn 
alignment_path_and_reader_wrappers_stream_mini_cram_records() { + let dir = mini_fixtures_dir(); + let cram = dir.join("mini.cram"); + let cram_index = dir.join("mini.cram.crai"); + let reference = dir.join("mini.fa"); + let target = locus("chr_test", 1000, 1000); + let options = GenotypeLoadOptions { + input_index: Some(cram_index.clone()), + ..GenotypeLoadOptions::default() + }; + + let mut path_seen = 0; + for_each_cram_record(&cram, &options, &reference, &target, |record| { + path_seen += 1; + assert!(!record.is_unmapped); + assert!(record.start <= 1000); + assert!(record.end >= 1000); + Ok(path_seen < 3) + }) + .unwrap(); + assert_eq!(path_seen, 3); + + let records = query_cram_records(&cram, &options, &reference, &target).unwrap(); + assert_eq!(records.len(), 50); + + let missing = locus("missing", 1, 1); + let err = query_cram_records(&cram, &options, &reference, &missing).unwrap_err(); + assert!(err.to_string().contains("does not contain contig missing")); + + let repository = build_reference_repository(&reference).unwrap(); + let index = parse_crai_bytes(&std::fs::read(cram_index).unwrap()).unwrap(); + let mut reader = + build_cram_indexed_reader_from_reader(File::open(cram).unwrap(), index, repository) + .unwrap(); + + let mut reader_seen = 0; + for_each_cram_record_with_reader(&mut reader, "mini.cram", &target, |_| { + reader_seen += 1; + Ok(reader_seen < 2) + }) + .unwrap(); + assert_eq!(reader_seen, 2); + + let err = for_each_cram_record_with_reader(&mut reader, "mini.cram", &target, |_| { + Err(RuntimeError::Unsupported("callback failed".to_owned())) + }) + .unwrap_err(); + assert!(err.to_string().contains("callback failed")); + + let mut raw_seen = 0; + for_each_raw_cram_record_with_reader(&mut reader, "mini.cram", &target, |_| { + raw_seen += 1; + Ok(raw_seen < 2) + }) + .unwrap(); + assert_eq!(raw_seen, 2); + } } diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index ae2091a..f0548cc 100644 --- 
a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -2640,6 +2640,10 @@ mod tests { } } + fn mini_fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures") + } + fn variant_with_loci() -> VariantSpec { VariantSpec { rsids: vec!["rs1".to_owned()], @@ -3088,6 +3092,397 @@ mod tests { ); } + #[test] + fn genotype_private_helpers_cover_vcf_file_zip_and_error_paths() { + let dir = temp_dir("vcf-file-zip-errors"); + let vcf_path = dir.join("sample.grch38.vcf"); + fs::write( + &vcf_path, + "##fileformat=VCFv4.3\n\ + ##reference=GRCh38\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + chr1\t10\trs10\tA\tG\t.\tPASS\t.\tGT:DP\t0/1:12\n\ + chr1\t19\trsDel\tAT\tA\t.\tPASS\t.\tGT\t1/1\n\ + chr2\t30\t.\tC\tT\t.\tPASS\t.\tDP:GT\t8:0/0\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&vcf_path).unwrap(); + assert_eq!(store.get("rs10").unwrap().as_deref(), Some("AG")); + let observations = store + .lookup_variants(&[ + VariantSpec { + grch38: Some(locus("1", 10, 10)), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(locus("1", 20, 20)), + deletion_length: Some(1), + kind: Some(VariantKind::Deletion), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(locus("2", 31, 31)), + kind: Some(VariantKind::Other), + ..VariantSpec::default() + }, + VariantSpec { + rsids: vec!["missing".to_owned()], + ..VariantSpec::default() + }, + ]) + .unwrap(); + assert_eq!(observations[0].genotype.as_deref(), Some("AG")); + assert_eq!(observations[1].genotype.as_deref(), Some("DD")); + assert_eq!(observations[2].genotype.as_deref(), None); + assert!(observations[3].evidence[0].contains("variant_by_rsid")); + assert_eq!(observations[0].assembly, Some(Assembly::Grch38)); + + let zip_path = dir.join("vcf.zip"); + let cursor = std::io::Cursor::new(Vec::new()); + let mut writer 
= zip::ZipWriter::new(cursor); + writer + .start_file("nested/sample.vcf", SimpleFileOptions::default()) + .unwrap(); + writer + .write_all(fs::read(&vcf_path).unwrap().as_slice()) + .unwrap(); + let bytes = writer.finish().unwrap().into_inner(); + fs::write(&zip_path, bytes).unwrap(); + let zip_store = GenotypeStore::from_file(&zip_path).unwrap(); + assert_eq!(zip_store.get("rs10").unwrap().as_deref(), Some("AG")); + + let err = scan_vcf_variants( + &VcfBackend { + path: dir.join("missing.vcf"), + }, + &[VariantSpec::default()], + ) + .unwrap_err(); + assert!(err.to_string().contains("failed to open VCF file")); + + let bad_zip_backend = DelimitedBackend { + format: GenotypeSourceFormat::Zip, + path: zip_path.clone(), + zip_entry_name: None, + }; + let err = scan_delimited_variants(&bad_zip_backend, &[VariantSpec::default()]).unwrap_err(); + assert!( + err.to_string() + .contains("zip backend missing selected entry"), + "{err}" + ); + + let bad_entry_backend = DelimitedBackend { + format: GenotypeSourceFormat::Zip, + path: zip_path, + zip_entry_name: Some("missing.csv".to_owned()), + }; + let err = + scan_delimited_variants(&bad_entry_backend, &[VariantSpec::default()]).unwrap_err(); + assert!( + err.to_string().contains("failed to open genotype entry"), + "{err}" + ); + } + + #[test] + fn genotype_private_helpers_cover_cram_backend_paths_with_mini_fixture() { + let dir = mini_fixtures_dir(); + let cram = dir.join("mini.cram"); + let cram_index = dir.join("mini.cram.crai"); + let reference = dir.join("mini.fa"); + let options = GenotypeLoadOptions { + input_index: Some(cram_index.clone()), + reference_file: Some(reference.clone()), + ..GenotypeLoadOptions::default() + }; + let store = GenotypeStore::from_file_with_options(&cram, &options).unwrap(); + assert_eq!(store.backend_name(), "cram"); + assert!(store.supports(QueryKind::GenotypeByLocus)); + assert!(!store.supports(QueryKind::GenotypeByRsid)); + + let snp = VariantSpec { + rsids: 
vec!["mini_locus_1000".to_owned()], + grch38: Some(locus("chr_test", 1000, 1000)), + reference: Some("A".to_owned()), + alternate: Some("C".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }; + let observation = store.lookup_variant(&snp).unwrap(); + assert_eq!(observation.backend, "cram"); + assert_eq!(observation.matched_rsid.as_deref(), Some("mini_locus_1000")); + assert_eq!(observation.genotype.as_deref(), Some("AA")); + assert_eq!(observation.depth, Some(50)); + + let deletion = VariantSpec { + rsids: vec!["mini_del".to_owned()], + grch38: Some(locus("chr_test", 1000, 1000)), + reference: Some("I".to_owned()), + alternate: Some("D".to_owned()), + kind: Some(VariantKind::Deletion), + deletion_length: Some(1), + ..VariantSpec::default() + }; + let deletion_observation = store.lookup_variant(&deletion).unwrap(); + assert_eq!(deletion_observation.genotype.as_deref(), Some("II")); + assert_eq!(deletion_observation.ref_count, Some(50)); + assert_eq!(deletion_observation.alt_count, Some(0)); + + let indel = VariantSpec { + rsids: vec!["mini_indel".to_owned()], + grch38: Some(locus("chr_test", 1000, 1000)), + reference: Some("A".to_owned()), + alternate: Some("AT".to_owned()), + kind: Some(VariantKind::Indel), + ..VariantSpec::default() + }; + let indel_observation = store.lookup_variant(&indel).unwrap(); + assert_eq!(indel_observation.genotype.as_deref(), Some("AA")); + assert_eq!(indel_observation.ref_count, Some(50)); + assert_eq!(indel_observation.alt_count, Some(0)); + + let missing_reference = GenotypeStore::from_file_with_options( + &cram, + &GenotypeLoadOptions { + input_index: Some(cram_index.clone()), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + let err = missing_reference.lookup_variant(&snp).unwrap_err(); + assert!(err.to_string().contains("without --reference-file")); + + let err = store.get("rs-only").unwrap_err(); + assert!(err.to_string().contains("needs GRCh37/GRCh38 coordinates")); + + let err = store + 
.lookup_variant(&VariantSpec { + grch38: Some(locus("chr_test", 1000, 1000)), + kind: Some(VariantKind::Other), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(err.to_string().contains("does not yet support")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(locus("chr_test", 1000, 1000)), + kind: Some(VariantKind::Snp), + alternate: Some("C".to_owned()), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(err.to_string().contains("SNP variant requires ref")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(locus("chr_test", 1000, 1000)), + kind: Some(VariantKind::Snp), + reference: Some("A".to_owned()), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(err.to_string().contains("SNP variant requires alt")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(locus("chr_test", 1000, 1000)), + kind: Some(VariantKind::Deletion), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(err.to_string().contains("deletion_length")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(locus("chr_test", 1000, 1000)), + kind: Some(VariantKind::Indel), + alternate: Some("AT".to_owned()), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(err.to_string().contains("indel variant requires ref")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(locus("chr_test", 1000, 1000)), + kind: Some(VariantKind::Insertion), + reference: Some("A".to_owned()), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(err.to_string().contains("indel variant requires alt")); + } + + #[test] + fn genotype_public_cram_reader_snp_wrapper_uses_mini_fixture() { + let dir = mini_fixtures_dir(); + let cram = dir.join("mini.cram"); + let cram_index = dir.join("mini.cram.crai"); + let reference = dir.join("mini.fa"); + let repository = crate::alignment::build_reference_repository(&reference).unwrap(); + let index = 
crate::alignment::parse_crai_bytes(&fs::read(cram_index).unwrap()).unwrap(); + let mut reader = crate::alignment::build_cram_indexed_reader_from_reader( + fs::File::open(cram).unwrap(), + index, + repository, + ) + .unwrap(); + + let observation = observe_cram_snp_with_reader( + &mut reader, + "mini.cram", + &locus("chr_test", 1000, 1000), + 'A', + 'C', + Some("mini_locus_1000".to_owned()), + Some(Assembly::Grch38), + ) + .unwrap(); + + assert_eq!(observation.genotype.as_deref(), Some("AA")); + assert_eq!(observation.ref_count, Some(50)); + assert_eq!(observation.alt_count, Some(0)); + assert_eq!(observation.depth, Some(50)); + assert_eq!(observation.assembly, Some(Assembly::Grch38)); + } + + #[test] + fn genotype_public_cram_reader_indel_wrapper_uses_mini_fixture() { + let dir = mini_fixtures_dir(); + let cram = dir.join("mini.cram"); + let cram_index = dir.join("mini.cram.crai"); + let reference = dir.join("mini.fa"); + let repository = crate::alignment::build_reference_repository(&reference).unwrap(); + let index = crate::alignment::parse_crai_bytes(&fs::read(cram_index).unwrap()).unwrap(); + let mut reader = crate::alignment::build_cram_indexed_reader_from_reader( + fs::File::open(cram).unwrap(), + index, + repository, + ) + .unwrap(); + + let observation = observe_cram_indel_with_reader( + &mut reader, + "mini.cram", + &locus("chr_test", 1000, 1000), + "A", + "AT", + Some("mini_indel".to_owned()), + Some(Assembly::Grch38), + ) + .unwrap(); + + assert_eq!(observation.backend, "cram"); + assert_eq!(observation.matched_rsid.as_deref(), Some("mini_indel")); + assert_eq!(observation.assembly, Some(Assembly::Grch38)); + assert_eq!(observation.genotype.as_deref(), Some("AA")); + assert_eq!(observation.ref_count, Some(50)); + assert_eq!(observation.alt_count, Some(0)); + assert_eq!(observation.depth, Some(50)); + assert!(observation.evidence[0].contains("matching_alt_lengths=none")); + } + + #[test] + fn genotype_public_vcf_reader_wrapper_uses_tiny_tabix_fixture() { + 
use noodles::vcf; + + let dir = temp_dir("vcf-reader-wrapper"); + let vcf_path = dir.join("sample.vcf.gz"); + let mut writer = bgzf::io::Writer::new(fs::File::create(&vcf_path).unwrap()); + writer + .write_all( + b"##fileformat=VCFv4.3\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + chr1\t10\trs10\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ + chr1\t20\trs20\tC\tT\t.\tPASS\t.\tGT\t1/1\n\ + chr2\t30\trs30\tG\tA\t.\tPASS\t.\tGT\t0/0\n", + ) + .unwrap(); + writer.finish().unwrap(); + + let open_indexed = || { + let index = vcf::fs::index(&vcf_path).unwrap(); + csi::io::IndexedReader::new(fs::File::open(&vcf_path).unwrap(), index) + }; + + let mut indexed = open_indexed(); + let observation = observe_vcf_snp_with_reader( + &mut indexed, + "tiny.vcf.gz", + &locus("1", 10, 10), + 'A', + 'G', + None, + Some(Assembly::Grch38), + ) + .unwrap(); + assert_eq!(observation.backend, "vcf"); + assert_eq!(observation.matched_rsid.as_deref(), Some("rs10")); + assert_eq!(observation.genotype.as_deref(), Some("AG")); + assert_eq!(observation.assembly, Some(Assembly::Grch38)); + + let mut indexed = open_indexed(); + let observation = observe_vcf_snp_with_reader( + &mut indexed, + "tiny.vcf.gz", + &locus("chr1", 10, 10), + 'A', + 'T', + Some("requested".to_owned()), + None, + ) + .unwrap(); + assert_eq!(observation.matched_rsid.as_deref(), Some("requested")); + assert!(observation.evidence[0].contains("did not match")); + + let mut indexed = open_indexed(); + let observation = observe_vcf_snp_with_reader( + &mut indexed, + "tiny.vcf.gz", + &locus("1", 11, 11), + 'A', + 'G', + None, + None, + ) + .unwrap(); + assert!(observation.evidence[0].contains("no VCF record")); + + let mut indexed = open_indexed(); + let observation = observe_vcf_snp_with_reader( + &mut indexed, + "tiny.vcf.gz", + &locus("missing", 10, 10), + 'A', + 'G', + Some("missing-rsid".to_owned()), + Some(Assembly::Grch37), + ) + .unwrap(); + assert_eq!(observation.matched_rsid.as_deref(), Some("missing-rsid")); + 
assert_eq!(observation.assembly, Some(Assembly::Grch37)); + assert!(observation.evidence[0].contains("has no contig")); + + let mut indexed = open_indexed(); + let err = observe_vcf_snp_with_reader( + &mut indexed, + "tiny.vcf.gz", + &locus("1", -1, -1), + 'A', + 'G', + None, + None, + ) + .unwrap_err(); + assert!(err.to_string().contains("invalid VCF position")); + } + #[test] fn zip_entry_limited_reader_rejects_oversized_output() { let mut reader = std::io::Cursor::new(b"abcdef".to_vec()); diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index 93dc0a0..cdd82e4 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -979,15 +979,28 @@ mod tests { #[test] fn inspect_helpers_cover_text_shape_source_and_assembly_edges() { assert_eq!(split_fields("rs1\t1\t2\tAA"), vec!["rs1", "1", "2", "AA"]); - assert_eq!(split_fields("\"rs1\", 1, 2, \"AG\""), vec!["rs1", "1", "2", "AG"]); + assert_eq!( + split_fields("\"rs1\", 1, 2, \"AG\""), + vec!["rs1", "1", "2", "AG"] + ); assert!(looks_like_genotype_text(&[ "// header".to_owned(), "i12345 XY 10 A G".to_owned(), "rs2 chr26 20 DD".to_owned(), ])); assert!(!looks_like_genotype_text(&["not enough fields".to_owned()])); - assert!(!matches_genotype_shape(&["bad".to_owned(), "1".to_owned(), "2".to_owned(), "AA".to_owned()])); - assert!(!matches_genotype_shape(&["rs1".to_owned(), "badchr".to_owned(), "2".to_owned(), "AA".to_owned()])); + assert!(!matches_genotype_shape(&[ + "bad".to_owned(), + "1".to_owned(), + "2".to_owned(), + "AA".to_owned() + ])); + assert!(!matches_genotype_shape(&[ + "rs1".to_owned(), + "badchr".to_owned(), + "2".to_owned(), + "AA".to_owned() + ])); assert!(!is_valid_genotype("")); assert!(!is_valid_genotype("ACGTI")); assert!(!is_valid_allele("N")); @@ -1001,13 +1014,18 @@ mod tests { assert_eq!(gfg.vendor.as_deref(), Some("Genes for Good")); assert_eq!(gfg.platform_version.as_deref(), Some("v1")); - let twenty_three = 
detect_source("/tmp/v5/23andme.txt", &[], DetectedKind::GenotypeText).unwrap(); + let twenty_three = + detect_source("/tmp/v5/23andme.txt", &[], DetectedKind::GenotypeText).unwrap(); assert_eq!(twenty_three.vendor.as_deref(), Some("23andMe")); assert_eq!(twenty_three.platform_version.as_deref(), Some("v5")); assert_eq!( - detect_source("sequencing.com.vcf", &["##source=sequencing.com".to_owned()], DetectedKind::Vcf) - .unwrap() - .confidence, + detect_source( + "sequencing.com.vcf", + &["##source=sequencing.com".to_owned()], + DetectedKind::Vcf + ) + .unwrap() + .confidence, DetectionConfidence::WeakHeuristic ); assert_eq!( @@ -1024,7 +1042,10 @@ mod tests { Some(Assembly::Grch37) ); assert_eq!( - detect_assembly("sample", &["##contig=".to_owned()]), + detect_assembly( + "sample", + &["##contig=".to_owned()] + ), Some(Assembly::Grch38) ); assert_eq!(detect_assembly("sample", &[]), None); @@ -1038,17 +1059,30 @@ mod tests { ..InspectOptions::default() }; assert_eq!( - detect_index(Path::new("sample.txt"), DetectedKind::GenotypeText, &options), + detect_index( + Path::new("sample.txt"), + DetectedKind::GenotypeText, + &options + ), (Some(false), Some(explicit)) ); let no_ext_ref = Path::new("reference"); assert_eq!( - detect_index(no_ext_ref, DetectedKind::ReferenceFasta, &InspectOptions::default()).1, + detect_index( + no_ext_ref, + DetectedKind::ReferenceFasta, + &InspectOptions::default() + ) + .1, Some(PathBuf::from("reference.fai")) ); assert_eq!( - detect_index(Path::new("sample.dat"), DetectedKind::AlignmentCram, &InspectOptions::default()), + detect_index( + Path::new("sample.dat"), + DetectedKind::AlignmentCram, + &InspectOptions::default() + ), (Some(false), None) ); diff --git a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index 2c33a43..67cb999 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -1594,7 +1594,10 @@ mod tests { assert!(ends_with_unescaped_backslash("x = 1\\")); 
assert!(!ends_with_unescaped_backslash("x = '\\\\'")); - assert_eq!(update_nesting_depth(0, "call(') still string', [1]) # ignored"), 0); + assert_eq!( + update_nesting_depth(0, "call(') still string', [1]) # ignored"), + 0 + ); assert_eq!(update_nesting_depth(0, "call(["), 2); assert_eq!(update_nesting_depth(2, "])"), 0); } @@ -1615,15 +1618,30 @@ mod tests { }; let object = variant_observation_object(&observation); assert!(matches!(attr(&object, "assembly"), Some(MontyObject::String(v)) if v == "grch37")); - assert!(matches!(attr(&object, "ref_count"), Some(MontyObject::Int(3)))); - assert!(matches!(attr(&object, "alt_count"), Some(MontyObject::Int(2)))); + assert!(matches!( + attr(&object, "ref_count"), + Some(MontyObject::Int(3)) + )); + assert!(matches!( + attr(&object, "alt_count"), + Some(MontyObject::Int(2)) + )); assert!(matches!(attr(&object, "depth"), Some(MontyObject::Int(5)))); let missing = variant_observation_object(&bioscript_core::VariantObservation::default()); - assert!(matches!(attr(&missing, "assembly"), Some(MontyObject::None))); - assert!(matches!(attr(&missing, "genotype"), Some(MontyObject::None))); + assert!(matches!( + attr(&missing, "assembly"), + Some(MontyObject::None) + )); + assert!(matches!( + attr(&missing, "genotype"), + Some(MontyObject::None) + )); - assert_eq!(string_or_list(&MontyObject::None).unwrap(), Vec::::new()); + assert_eq!( + string_or_list(&MontyObject::None).unwrap(), + Vec::::new() + ); assert_eq!( string_list_from_object(&MontyObject::None).unwrap(), Vec::::new() @@ -1639,7 +1657,12 @@ mod tests { attrs: vec![].into(), frozen: true, }; - assert!(variant_specs_from_plan(&bad_plan).unwrap_err().to_string().contains("missing variants")); + assert!( + variant_specs_from_plan(&bad_plan) + .unwrap_err() + .to_string() + .contains("missing variants") + ); let bad_variant = MontyObject::Dataclass { name: "Other".to_owned(), @@ -1648,7 +1671,17 @@ mod tests { attrs: vec![].into(), frozen: true, }; - 
assert!(dataclass_to_variant_spec(&bad_variant).unwrap_err().to_string().contains("got Other")); - assert!(dataclass_to_variant_spec(&MontyObject::None).unwrap_err().to_string().contains("expected Variant object")); + assert!( + dataclass_to_variant_spec(&bad_variant) + .unwrap_err() + .to_string() + .contains("got Other") + ); + assert!( + dataclass_to_variant_spec(&MontyObject::None) + .unwrap_err() + .to_string() + .contains("expected Variant object") + ); } } diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index f645546..104bf94 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -782,6 +782,7 @@ fn remote_resource_resolution_handles_json_versions_and_plain_relative_urls() { } #[test] +#[allow(clippy::too_many_lines)] fn validate_variants_covers_remaining_identity_coordinate_and_allele_edges() { let dir = temp_dir("validate-variant-more-edges"); fs::write( @@ -889,6 +890,7 @@ alleles: } #[test] +#[allow(clippy::too_many_lines)] fn validate_panels_and_loaders_cover_parse_error_edges() { let dir = temp_dir("validate-panel-more-edges"); let non_panel = dir.join("variant.yaml"); @@ -937,8 +939,14 @@ members: let err = load_panel_manifest(&invalid_panel).unwrap_err(); assert!(err.contains("name: missing required field"), "{err}"); assert!(err.contains("downloads[0].id: empty string"), "{err}"); - assert!(err.contains("downloads[0].version: missing required field"), "{err}"); - assert!(err.contains("members[0].kind: missing required field"), "{err}"); + assert!( + err.contains("downloads[0].version: missing required field"), + "{err}" + ); + assert!( + err.contains("members[0].kind: missing required field"), + "{err}" + ); assert!(err.contains("members[0].download: empty string"), "{err}"); assert!(err.contains("members[1]: expected mapping"), "{err}"); @@ -992,7 +1000,10 @@ alleles: "#, ) .unwrap_err(); - 
assert!(invalid_lookup.contains("schema: expected schema"), "{invalid_lookup}"); + assert!( + invalid_lookup.contains("schema: expected schema"), + "{invalid_lookup}" + ); assert!( invalid_lookup.contains("coordinates.grch38.pos: expected integer >= 1"), "{invalid_lookup}" From 57106aed30814cbda5a564e6bf8644da807a6224 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 16:44:54 +1000 Subject: [PATCH 8/9] increased coverage --- .github/workflows/coverage.yml | 2 +- coverage.sh | 6 +- rust/bioscript-cli/src/main.rs | 231 +++++++++++++++++- rust/bioscript-formats/src/alignment.rs | 120 +++++----- rust/bioscript-formats/src/genotype.rs | 114 +++++++++ rust/bioscript-formats/src/inspect.rs | 171 ++++++++++++++ rust/bioscript-formats/tests/prepare.rs | 54 +++++ rust/bioscript-runtime/src/runtime.rs | 299 ++++++++++++++++++++++++ 8 files changed, 933 insertions(+), 64 deletions(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 826ccde..c000012 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - test: [file_formats, formats_lib, inspect, prepare, cli, schema, core, runtime_lib, runtime_security, runtime_resources] + test: [file_formats, formats_lib, inspect, prepare, cli, cli_bin, schema, core, runtime_lib, runtime_security, runtime_resources] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/coverage.sh b/coverage.sh index 89791f2..305a369 100755 --- a/coverage.sh +++ b/coverage.sh @@ -24,7 +24,7 @@ Usage: ./coverage.sh [--full-clean|-c] [--open] [--large] [--all-tests] [--no-li --all-tests Run all tests for the first-party BioScript crates --no-lint Skip cargo fmt and clippy checks --focused-test Run one focused integration test target: - file_formats, formats_lib, inspect, prepare, cli, schema, core, + file_formats, formats_lib, inspect, prepare, cli, cli_bin, schema, core, runtime_lib, runtime_security, or 
runtime_resources Environment: @@ -172,6 +172,9 @@ if [[ -n "$FOCUSED_TEST" ]]; then cli) env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --test cli -- --nocapture --test-threads="$TEST_THREADS" ;; + cli_bin) + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --bin bioscript + ;; schema) env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-schema --test validate_variants -- --nocapture --test-threads="$TEST_THREADS" ;; @@ -201,6 +204,7 @@ else env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test inspect -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-formats --test prepare -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --test cli -- --nocapture --test-threads="$TEST_THREADS" + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --bin bioscript env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-schema --test validate_variants -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --lib env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --lib diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index 844f7ef..077ebd2 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -31,7 +31,12 @@ fn main() -> ExitCode { #[allow(clippy::too_many_lines)] fn run_cli() -> Result<(), String> { - let mut args = env::args().skip(1); + run_cli_args(env::args().skip(1).collect()) +} + +#[allow(clippy::too_many_lines)] +fn run_cli_args(raw_args: Vec) -> Result<(), String> { + let mut args = raw_args.clone().into_iter(); if let Some(first) = args.next() { if first == "validate-variants" { return run_validate_variants(args.collect()); @@ -47,7 +52,7 @@ fn run_cli() -> Result<(), String> { } } - let mut args = env::args().skip(1); + let mut args = 
raw_args.into_iter(); let mut script_path: Option = None; let mut root: Option = None; let mut input_file: Option = None; @@ -861,3 +866,225 @@ fn normalize_loader_paths(root: &Path, loader: &mut GenotypeLoadOptions) { loader.reference_index = Some(resolve_cli_path_buf(root, &path)); } } + +#[cfg(test)] +mod tests { + use super::*; + use bioscript_core::{Assembly, VariantObservation}; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_dir(label: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock drift") + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "bioscript-cli-unit-{label}-{}-{nanos}", + std::process::id() + )); + fs::create_dir_all(&dir).unwrap(); + dir + } + + #[test] + fn cli_private_helpers_render_rows_filters_paths_and_loader_paths() { + let root = temp_dir("helpers-root"); + let manifest_path = root.join("panels/panel.yaml"); + let member_dir = root.join("panels/members"); + fs::create_dir_all(&member_dir).unwrap(); + let variant_path = member_dir.join("apol1.yaml"); + fs::write(&manifest_path, "schema: bioscript:panel:1.0\n").unwrap(); + fs::write(&variant_path, "schema: bioscript:variant:1.0\n").unwrap(); + + let manifest = VariantManifest { + name: "APOL1 G1".to_owned(), + path: variant_path.clone(), + tags: vec!["kidney".to_owned(), "apol1".to_owned()], + spec: bioscript_core::VariantSpec::default(), + }; + + assert!(matches_filters( + &manifest, + &variant_path, + &[ + "kind=variant".to_owned(), + "name=APOL1".to_owned(), + "tag=kidney".to_owned(), + "path=apol1".to_owned(), + ], + )); + assert!(!matches_filters( + &manifest, + &variant_path, + &["kind=panel".to_owned()] + )); + assert!(!matches_filters( + &manifest, + &variant_path, + &["bad".to_owned()] + )); + + assert_eq!( + resolve_manifest_path(&root, &manifest_path, "members/apol1.yaml").unwrap(), + variant_path.canonicalize().unwrap() + ); + let outside = temp_dir("helpers-outside").join("escape.yaml"); + fs::write(&outside, 
"schema: bioscript:variant:1.0\n").unwrap(); + let err = + resolve_manifest_path(&root, &manifest_path, &outside.to_string_lossy()).unwrap_err(); + assert!(err.contains("escapes bioscript root"), "{err}"); + + let observation = VariantObservation { + backend: "vcf".to_owned(), + matched_rsid: Some("rs1".to_owned()), + assembly: Some(Assembly::Grch38), + genotype: Some("AG".to_owned()), + ref_count: Some(7), + alt_count: Some(3), + depth: Some(10), + evidence: vec!["one\twith tab".to_owned(), "two".to_owned()], + ..VariantObservation::default() + }; + let row = variant_row( + &root, + &variant_path, + "APOL1 G1", + &["kidney".to_owned()], + &observation, + Some("p1"), + ); + let tsv = render_rows_as_tsv(&[row]); + assert!(tsv.contains("participant_id\tbackend"), "{tsv}"); + assert!(tsv.contains("p1\tvcf\trs1\tgrch38\tAG\t7\t3\t10"), "{tsv}"); + assert!(tsv.contains("one with tab | two"), "{tsv}"); + + assert_eq!( + resolve_cli_path(&root, "sample.txt"), + root.join("sample.txt").display().to_string() + ); + assert_eq!( + resolve_cli_path_buf(&root, Path::new("/tmp/abs")), + PathBuf::from("/tmp/abs") + ); + + let mut loader = GenotypeLoadOptions { + input_index: Some(PathBuf::from("input.crai")), + reference_file: Some(PathBuf::from("ref.fa")), + reference_index: Some(PathBuf::from("ref.fa.fai")), + ..GenotypeLoadOptions::default() + }; + normalize_loader_paths(&root, &mut loader); + assert_eq!( + loader.input_index.as_deref(), + Some(root.join("input.crai").as_path()) + ); + assert_eq!( + loader.reference_file.as_deref(), + Some(root.join("ref.fa").as_path()) + ); + assert_eq!( + loader.reference_index.as_deref(), + Some(root.join("ref.fa.fai").as_path()) + ); + } + + #[test] + fn cli_private_helpers_cover_manifest_schema_and_timing_errors() { + let dir = temp_dir("schema-timing"); + let valid = dir.join("valid.yaml"); + let missing_schema = dir.join("missing.yaml"); + let invalid_yaml = dir.join("invalid.yaml"); + fs::write(&valid, "schema: 
bioscript:variant:1.0\n").unwrap(); + fs::write(&missing_schema, "name: no schema\n").unwrap(); + fs::write(&invalid_yaml, "schema: [").unwrap(); + + assert_eq!(manifest_schema(&valid).unwrap(), "bioscript:variant:1.0"); + assert!( + manifest_schema(&missing_schema) + .unwrap_err() + .contains("missing schema") + ); + assert!( + manifest_schema(&invalid_yaml) + .unwrap_err() + .contains("failed to parse YAML") + ); + assert!( + manifest_schema(&dir.join("absent.yaml")) + .unwrap_err() + .contains("failed to read") + ); + + let timing_path = dir.join("nested/timing.tsv"); + write_timing_report( + &timing_path, + &[ + StageTiming { + stage: "one".to_owned(), + duration_ms: 2, + detail: "contains\ttab".to_owned(), + }, + StageTiming { + stage: "two".to_owned(), + duration_ms: 3, + detail: "plain".to_owned(), + }, + ], + ) + .unwrap(); + let report = fs::read_to_string(&timing_path).unwrap(); + assert!(report.contains("stage\tduration_ms\tdetail")); + assert!(report.contains("one\t2\tcontains tab")); + } + + #[test] + fn cli_arg_parser_reports_missing_and_invalid_values_without_spawning() { + for (flag, expected) in [ + ("--input-format", "--input-format requires a value"), + ("--max-duration-ms", "--max-duration-ms requires an integer"), + ( + "--max-memory-bytes", + "--max-memory-bytes requires an integer", + ), + ("--max-allocations", "--max-allocations requires an integer"), + ( + "--max-recursion-depth", + "--max-recursion-depth requires an integer", + ), + ] { + let err = run_cli_args(vec![flag.to_owned()]).unwrap_err(); + assert!(err.contains(expected), "{flag}: {err}"); + } + + for (flag, value, expected) in [ + ( + "--input-format", + "unknown", + "invalid --input-format value unknown", + ), + ( + "--max-duration-ms", + "nan", + "invalid --max-duration-ms value nan", + ), + ( + "--max-memory-bytes", + "nan", + "invalid --max-memory-bytes value nan", + ), + ( + "--max-allocations", + "nan", + "invalid --max-allocations value nan", + ), + ( + 
"--max-recursion-depth", + "nan", + "invalid --max-recursion-depth value nan", + ), + ] { + let err = run_cli_args(vec![flag.to_owned(), value.to_owned()]).unwrap_err(); + assert!(err.contains(expected), "{flag}: {err}"); + } + } +} diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index 7185b04..0c97464 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -5,7 +5,7 @@ use std::{ }; use noodles::{ - core::{Position, Region}, + core::{Position, Region, region::Interval}, cram::{self, crai, io::reader::Container}, fasta::{self, repository::adapters::IndexedReader as FastaIndexedReader}, sam::{ @@ -465,34 +465,15 @@ where &external_data_srcs, true, |record| { - let alignment_record = match build_alignment_record_from_cram(label, record) { - Ok(r) => r, - Err(e) => { - callback_err = Some(e); - return Ok(false); - } - }; - - if alignment_record.start > locus_end { - stop = true; - return Ok(false); - } - - if !alignment_record_intersects_interval(&alignment_record, interval) { - return Ok(true); - } - - match on_record(record.clone()) { - Ok(true) => Ok(true), - Ok(false) => { - stop = true; - Ok(false) - } - Err(e) => { - callback_err = Some(e); - Ok(false) - } - } + Ok(handle_decoded_cram_record( + label, + record, + interval, + locus_end, + &mut stop, + &mut callback_err, + on_record, + )) }, ); @@ -516,38 +497,15 @@ where &external_data_srcs, false, |record| { - let alignment_record = - match build_alignment_record_from_cram(label, record) { - Ok(r) => r, - Err(e) => { - callback_err = Some(e); - return Ok(false); - } - }; - - if alignment_record.start > locus_end { - stop = true; - return Ok(false); - } - - if !alignment_record_intersects_interval( - &alignment_record, + Ok(handle_decoded_cram_record( + label, + record, interval, - ) { - return Ok(true); - } - - match on_record(record.clone()) { - Ok(true) => Ok(true), - Ok(false) => { - stop = true; - Ok(false) - } - 
Err(e) => { - callback_err = Some(e); - Ok(false) - } - } + locus_end, + &mut stop, + &mut callback_err, + on_record, + )) }, ) .map_err(|err| { @@ -585,6 +543,48 @@ where Ok(()) } +fn handle_decoded_cram_record( + label: &str, + record: &cram::Record<'_>, + interval: Interval, + locus_end: i64, + stop: &mut bool, + callback_err: &mut Option, + on_record: &mut F, +) -> bool +where + F: FnMut(cram::Record<'_>) -> Result, +{ + let alignment_record = match build_alignment_record_from_cram(label, record) { + Ok(record) => record, + Err(err) => { + *callback_err = Some(err); + return false; + } + }; + + if alignment_record.start > locus_end { + *stop = true; + return false; + } + + if !alignment_record_intersects_interval(&alignment_record, interval) { + return true; + } + + match on_record(record.clone()) { + Ok(true) => true, + Ok(false) => { + *stop = true; + false + } + Err(err) => { + *callback_err = Some(err); + false + } + } +} + fn is_reference_md5_mismatch(err: &std::io::Error) -> bool { err.to_string() .contains("reference sequence checksum mismatch") diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index f0548cc..b7173fd 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -2669,6 +2669,16 @@ mod tests { choose_variant_locus(&variant, Path::new("ref/hg19.fa")), Some((Assembly::Grch37, locus("1", 10, 10))) ); + assert_eq!( + choose_variant_locus( + &VariantSpec { + grch38: Some(locus("3", 30, 30)), + ..VariantSpec::default() + }, + Path::new("ref/hg19.fa") + ), + Some((Assembly::Grch38, locus("3", 30, 30))) + ); assert_eq!( choose_variant_locus(&variant, Path::new("ref/unknown.fa")), Some((Assembly::Grch38, locus("2", 20, 20))) @@ -2771,10 +2781,21 @@ mod tests { let short_row = parser.consume_record("bad,row").unwrap().unwrap(); assert_eq!(short_row.rsid.as_deref(), Some("bad")); assert_eq!(short_row.genotype, "--"); + assert_eq!( + 
parser.consume_line("rs4,4,40,tt").unwrap(), + Some(("rs4".to_owned(), "TT".to_owned())) + ); + assert!(parser.consume_record(",,").unwrap().is_none()); + assert_eq!(parser.default_header(2), vec!["rsid", "chromosome"]); assert_eq!(parser.default_header(6).len(), 6); let mut indexes = None; let mut comment_header = None; + assert!( + parse_streaming_row("", Delimiter::Space, &mut indexes, &mut comment_header) + .unwrap() + .is_none() + ); assert!( parse_streaming_row( "// marker chromosome position result", @@ -2918,6 +2939,24 @@ mod tests { &VariantSpec::default(), None )); + assert!(!vcf_row_matches_variant( + &row, + &VariantSpec { + grch38: Some(locus("2", 10, 10)), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + Some(Assembly::Grch38) + )); + assert!(vcf_row_matches_variant( + &row, + &VariantSpec { + grch38: Some(locus("1", 10, 10)), + kind: Some(VariantKind::Other), + ..VariantSpec::default() + }, + Some(Assembly::Grch38) + )); } #[test] @@ -3092,6 +3131,81 @@ mod tests { ); } + #[test] + fn genotype_public_entry_points_cover_in_memory_sources_and_fallbacks() { + let text_store = + GenotypeStore::from_bytes("sample.txt", b"rsid genotype\nrs1 AG\nrs2 CT\n").unwrap(); + assert_eq!(text_store.backend_name(), "text"); + assert!(text_store.supports(QueryKind::GenotypeByRsid)); + assert!(!text_store.supports(QueryKind::GenotypeByLocus)); + assert_eq!(text_store.get("rs1").unwrap().as_deref(), Some("AG")); + let observations = text_store + .lookup_variants(&[ + VariantSpec { + rsids: vec!["rs2".to_owned()], + ..VariantSpec::default() + }, + VariantSpec { + rsids: vec!["missing".to_owned()], + ..VariantSpec::default() + }, + ]) + .unwrap(); + assert_eq!(observations[0].genotype.as_deref(), Some("CT")); + assert!(observations[1].evidence[0].contains("no matching rsid")); + + let vcf_store = GenotypeStore::from_bytes( + "sample.vcf", + b"##fileformat=VCFv4.3\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 
1\t10\trs10\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ + 1\t20\t.\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ + 1\t30\trs_bad\t.\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + assert_eq!(vcf_store.backend_name(), "vcf"); + assert_eq!(vcf_store.get("rs10").unwrap().as_deref(), Some("AG")); + + let cursor = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .add_directory("nested/", SimpleFileOptions::default()) + .unwrap(); + writer + .start_file("nested/sample.vcf", SimpleFileOptions::default()) + .unwrap(); + writer + .write_all( + b"##fileformat=VCFv4.3\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 2\t20\trs20\tC\tT\t.\tPASS\t.\tGT\t1/1\n", + ) + .unwrap(); + let bytes = writer.finish().unwrap().into_inner(); + let zip_store = GenotypeStore::from_bytes("sample.zip", &bytes).unwrap(); + assert_eq!(zip_store.backend_name(), "vcf"); + assert_eq!(zip_store.get("rs20").unwrap().as_deref(), Some("TT")); + + let cursor = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .add_directory("empty/", SimpleFileOptions::default()) + .unwrap(); + let bytes = writer.finish().unwrap().into_inner(); + let err = GenotypeStore::from_bytes("empty.zip", &bytes).unwrap_err(); + assert!( + err.to_string() + .contains("does not contain a supported genotype file"), + "{err}" + ); + + assert!( + GenotypeSourceFormat::from_str("unknown") + .unwrap_err() + .contains("unsupported input format") + ); + } + #[test] fn genotype_private_helpers_cover_vcf_file_zip_and_error_paths() { let dir = temp_dir("vcf-file-zip-errors"); diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index cdd82e4..1c70dea 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -963,6 +963,7 @@ fn render_bool(value: Option) -> &'static str { #[cfg(test)] mod tests { use super::*; + use std::io::Write as _; use std::path::PathBuf; #[test] @@ -1096,4 +1097,174 
@@ mod tests { assert_eq!(render_bool(Some(false)), "false"); assert_eq!(render_bool(None), ""); } + + #[test] + fn inspect_helpers_cover_bgzip_zip_and_index_edges() { + let mut bgzf_writer = bgzf::io::Writer::new(Vec::new()); + bgzf_writer + .write_all( + b"##fileformat=VCFv4.3\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + chr1\t10\trs10\tA\tG\t.\tPASS\t.\tGT\t0|1\n", + ) + .unwrap(); + let bgzf_vcf = bgzf_writer.finish().unwrap(); + + let bgzip_inspection = + inspect_bytes("sample.vcf.gz", &bgzf_vcf, &InspectOptions::default()).unwrap(); + assert_eq!(bgzip_inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(bgzip_inspection.phased, Some(true)); + + let cursor = Cursor::new(Vec::new()); + let mut zip_writer = zip::ZipWriter::new(cursor); + zip_writer + .add_directory("nested/", zip::write::SimpleFileOptions::default()) + .unwrap(); + zip_writer + .start_file( + "nested/sample.vcf.gz", + zip::write::SimpleFileOptions::default(), + ) + .unwrap(); + zip_writer.write_all(&bgzf_vcf).unwrap(); + let zip_bytes = zip_writer.finish().unwrap().into_inner(); + let zip_inspection = + inspect_bytes("archive.zip", &zip_bytes, &InspectOptions::default()).unwrap(); + assert_eq!(zip_inspection.container, FileContainer::Zip); + assert_eq!(zip_inspection.detected_kind, DetectedKind::Vcf); + assert_eq!( + zip_inspection.selected_entry.as_deref(), + Some("nested/sample.vcf.gz") + ); + + let missing = read_zip_sample_lines_from_bytes(&zip_bytes, "missing.vcf").unwrap_err(); + assert!(missing.to_string().contains("failed to open zip entry")); + + let dir = + std::env::temp_dir().join(format!("bioscript-inspect-unit-{}", std::process::id())); + std::fs::create_dir_all(&dir).unwrap(); + let cram_no_ext = dir.join("sample.dat"); + let bam_short = dir.join("reads.bam"); + let short_bai = dir.join("reads.bai"); + std::fs::write(&cram_no_ext, b"cram").unwrap(); + std::fs::write(&bam_short, b"bam").unwrap(); + std::fs::write(&short_bai, b"bai").unwrap(); + + 
assert_eq!( + detect_index( + &cram_no_ext, + DetectedKind::AlignmentCram, + &InspectOptions::default() + ), + (Some(false), None) + ); + assert_eq!( + detect_index( + &bam_short, + DetectedKind::AlignmentBam, + &InspectOptions::default() + ), + (Some(true), Some(short_bai)) + ); + assert_eq!( + classify_confidence(DetectedKind::Vcf, &[], None), + DetectionConfidence::StrongHeuristic + ); + } + + #[test] + fn inspect_helpers_cover_unheaded_text_zip_fallbacks_and_render_edges() { + let unheaded = inspect_bytes( + "sample.txt", + b"rs123\t1\t12345\tAG\n", + &InspectOptions::default(), + ) + .unwrap(); + assert_eq!(unheaded.detected_kind, DetectedKind::GenotypeText); + assert!( + unheaded + .evidence + .iter() + .any(|line| line == "genotype-like sampled rows") + ); + + let dir = + std::env::temp_dir().join(format!("bioscript-inspect-more-{}", std::process::id())); + std::fs::create_dir_all(&dir).unwrap(); + let unknown_path = dir.join("unknown.dat"); + std::fs::write(&unknown_path, b"not enough structure\n").unwrap(); + let unknown = inspect_file(&unknown_path, &InspectOptions::default()).unwrap(); + assert_eq!(unknown.detected_kind, DetectedKind::Unknown); + assert!(unknown.warnings[0].contains("known textual heuristics")); + + let mut bgzf_writer = bgzf::io::Writer::new(Vec::new()); + bgzf_writer + .write_all( + b"##fileformat=VCFv4.3\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + chr1\t10\trs10\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + let bgzf_vcf = bgzf_writer.finish().unwrap(); + let vcf_gz_path = dir.join("sample.vcf.gz"); + std::fs::write(&vcf_gz_path, &bgzf_vcf).unwrap(); + assert_eq!(read_plain_sample_lines(&vcf_gz_path).unwrap().len(), 3); + + let zip_path = dir.join("fallback.zip"); + let cursor = Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .add_directory("__MACOSX/", zip::write::SimpleFileOptions::default()) + .unwrap(); + writer + .start_file("notes.bin", 
zip::write::SimpleFileOptions::default()) + .unwrap(); + writer.write_all(b"fallback bytes\n").unwrap(); + let bytes = writer.finish().unwrap().into_inner(); + std::fs::write(&zip_path, &bytes).unwrap(); + assert_eq!(select_zip_entry(&zip_path).unwrap(), "notes.bin"); + + let zip_gz_path = dir.join("vcf-gz.zip"); + let cursor = Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .start_file( + "nested/sample.vcf.gz", + zip::write::SimpleFileOptions::default(), + ) + .unwrap(); + writer.write_all(&bgzf_vcf).unwrap(); + let bytes = writer.finish().unwrap().into_inner(); + std::fs::write(&zip_gz_path, &bytes).unwrap(); + assert_eq!( + read_zip_sample_lines(&zip_gz_path, "nested/sample.vcf.gz") + .unwrap() + .len(), + 3 + ); + + let empty_zip_path = dir.join("empty.zip"); + let cursor = Cursor::new(Vec::new()); + let writer = zip::ZipWriter::new(cursor); + let bytes = writer.finish().unwrap().into_inner(); + std::fs::write(&empty_zip_path, bytes).unwrap(); + let err = select_zip_entry(&empty_zip_path).unwrap_err(); + assert!( + err.to_string() + .contains("does not contain a supported file") + ); + + let source = detect_source( + "dynamicdna.txt", + &["# Dynamic DNA GSAv3 report".to_owned()], + DetectedKind::GenotypeText, + ) + .unwrap(); + assert_eq!(source.platform_version.as_deref(), Some("GSAv3")); + assert_eq!(canonicalize_ancestry_version("v2"), "V2"); + assert_eq!(render_kind(DetectedKind::AlignmentCram), "alignment_cram"); + assert_eq!(render_kind(DetectedKind::AlignmentBam), "alignment_bam"); + assert_eq!(render_assembly(Some(Assembly::Grch38)), "grch38"); + assert_eq!(render_bool(Some(true)), "true"); + } } diff --git a/rust/bioscript-formats/tests/prepare.rs b/rust/bioscript-formats/tests/prepare.rs index d533b41..ce4ef62 100644 --- a/rust/bioscript-formats/tests/prepare.rs +++ b/rust/bioscript-formats/tests/prepare.rs @@ -1,5 +1,7 @@ use std::{ + collections::hash_map::DefaultHasher, fs, + hash::{Hash, Hasher}, path::PathBuf, 
time::{SystemTime, UNIX_EPOCH}, }; @@ -261,6 +263,58 @@ fn fasta_index_is_generated_in_cache_when_missing() { assert!(cached_index.exists()); } +#[test] +fn cached_fasta_reference_and_index_are_reused() { + let root = temp_dir("cached-fasta-root"); + let cwd = temp_dir("cached-fasta-cwd"); + fs::write(root.join("ref.fa"), b">chr1\nACGT\n").unwrap(); + + let mut req = request(root.clone(), cwd.clone(), PathBuf::from("cache")); + req.reference_file = Some("ref.fa".to_owned()); + let first = prepare_indexes(&req).unwrap(); + let cached_reference = first.reference_file.unwrap(); + let cached_index = first.reference_index.unwrap(); + assert!(cached_reference.exists()); + assert!(cached_index.exists()); + + let second = prepare_indexes(&req).unwrap(); + assert_eq!( + second.reference_file.as_deref(), + Some(cached_reference.as_path()) + ); + assert_eq!( + second.reference_index.as_deref(), + Some(cached_index.as_path()) + ); + assert!(shell_flags(&second).contains("--reference-file")); +} + +#[test] +fn cached_cram_index_is_reused_when_present() { + let root = temp_dir("cached-cram-root"); + let cwd = temp_dir("cached-cram-cwd"); + let cache = cwd.join("cache"); + fs::create_dir_all(&cache).unwrap(); + let cram = root.join("sample.cram"); + fs::write(&cram, b"not a real cram").unwrap(); + + let canonical = cram.canonicalize().unwrap(); + let mut hasher = DefaultHasher::new(); + canonical.to_string_lossy().hash(&mut hasher); + let hash = hasher.finish(); + let cached_index = cache.join(format!("sample.cram-{hash:016x}.crai")); + fs::write(&cached_index, b"existing crai").unwrap(); + + let mut req = request(root, cwd, PathBuf::from("cache")); + req.input_file = Some("sample.cram".to_owned()); + let prepared = prepare_indexes(&req).unwrap(); + + assert_eq!( + prepared.input_index.as_deref(), + Some(cached_index.as_path()) + ); +} + #[test] fn fasta_without_extension_uses_fai_extension() { let root = temp_dir("fasta-no-extension-root"); diff --git 
a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index 67cb999..d67d342 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -1541,6 +1541,20 @@ fn update_nesting_depth(mut depth: usize, line: &str) -> usize { #[cfg(test)] mod tests { use super::*; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_dir(label: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock drift") + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "bioscript-runtime-unit-{label}-{}-{nanos}", + std::process::id() + )); + fs::create_dir_all(&dir).unwrap(); + dir + } fn attr<'a>(obj: &'a MontyObject, name: &str) -> Option<&'a MontyObject> { let MontyObject::Dataclass { attrs, .. } = obj else { @@ -1684,4 +1698,289 @@ mod tests { .contains("expected Variant object") ); } + + #[test] + fn runtime_private_methods_cover_dispatch_and_path_errors() { + let root = temp_dir("dispatch"); + fs::write(root.join("input.txt"), "hello").unwrap(); + let runtime = BioscriptRuntime::new(&root).unwrap(); + let bioscript = MontyObject::Dataclass { + name: "Bioscript".to_owned(), + type_id: 1, + field_names: vec![], + attrs: vec![].into(), + frozen: true, + }; + + let err = runtime + .dispatch_method_call("missing", std::slice::from_ref(&bioscript), &[]) + .unwrap_err(); + assert!(err.to_string().contains("no attribute")); + + assert!( + runtime + .method_read_text(&[], &[]) + .unwrap_err() + .to_string() + .contains("expects self and path") + ); + assert!( + runtime + .method_write_text(&[], &[]) + .unwrap_err() + .to_string() + .contains("expects self, path, text") + ); + assert!( + runtime + .resolve_user_path("/absolute") + .unwrap_err() + .to_string() + .contains("absolute paths") + ); + assert!( + runtime + .resolve_user_path("../escape") + .unwrap_err() + .to_string() + .contains("escapes bioscript root") + ); + assert!( + runtime + 
.resolve_existing_user_path("missing.txt") + .unwrap_err() + .to_string() + .contains("failed to resolve") + ); + + let nested = runtime + .resolve_user_write_path("new/deep/file.txt") + .unwrap(); + assert_eq!( + nested, + root.canonicalize().unwrap().join("new/deep/file.txt") + ); + assert_eq!( + deepest_existing_ancestor(&root.join("new/deep/file.txt")), + root.as_path() + ); + + let mut config = RuntimeConfig::default(); + config.loader.input_index = Some(PathBuf::from("input.txt")); + config.loader.reference_file = Some(PathBuf::from("/tmp/reference.fa")); + let runtime = BioscriptRuntime::with_config(&root, config).unwrap(); + let loader = runtime.resolved_loader_options().unwrap(); + assert_eq!( + loader.input_index.as_deref(), + Some(root.canonicalize().unwrap().join("input.txt").as_path()) + ); + assert_eq!( + loader.reference_file.as_deref(), + Some(Path::new("/tmp/reference.fa")) + ); + } + + #[test] + fn runtime_private_methods_cover_unknown_genotype_handles() { + let root = temp_dir("handles"); + let runtime = BioscriptRuntime::new(&root).unwrap(); + let genotype = genotype_file_object(99); + let variant = variant_object(&VariantSpec { + rsids: vec!["rs1".to_owned()], + ..VariantSpec::default() + }); + let plan = variant_plan_object(&[VariantSpec { + rsids: vec!["rs1".to_owned()], + ..VariantSpec::default() + }]); + + for (method, args) in [ + ( + "get", + vec![genotype.clone(), MontyObject::String("rs1".to_owned())], + ), + ("lookup_variant", vec![genotype.clone(), variant.clone()]), + ( + "lookup_variant_details", + vec![genotype.clone(), variant.clone()], + ), + ("lookup_variants", vec![genotype.clone(), plan.clone()]), + ("lookup_variants_details", vec![genotype.clone(), plan]), + ] { + let err = runtime + .dispatch_method_call(method, &args, &[]) + .unwrap_err() + .to_string(); + assert!(err.contains("unknown genotype handle"), "{method}: {err}"); + } + + assert!( + dataclass_handle_id(&MontyObject::None, "GenotypeFile") + .unwrap_err() + 
.to_string() + .contains("expected GenotypeFile object") + ); + let missing_handle = MontyObject::Dataclass { + name: "GenotypeFile".to_owned(), + type_id: 2, + field_names: vec![], + attrs: vec![].into(), + frozen: true, + }; + assert!( + dataclass_handle_id(&missing_handle, "GenotypeFile") + .unwrap_err() + .to_string() + .contains("missing handle_id") + ); + } + + #[test] + fn runtime_private_methods_cover_successful_genotype_paths() { + let root = temp_dir("success-paths"); + fs::write( + root.join("genotypes.tsv"), + "rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\nrs2\t2\t20\tCT\n", + ) + .unwrap(); + let runtime = BioscriptRuntime::new(&root).unwrap(); + let bioscript = bioscript_object(); + + let genotype = runtime + .method_load_genotypes( + &[ + bioscript.clone(), + MontyObject::String("genotypes.tsv".to_owned()), + ], + &[], + ) + .unwrap(); + assert!(matches!( + attr(&genotype, "handle_id"), + Some(MontyObject::Int(id)) if *id > 0 + )); + + let value = runtime + .method_genotype_get( + &[genotype.clone(), MontyObject::String("rs1".to_owned())], + &[], + ) + .unwrap(); + assert!(matches!(value, MontyObject::String(ref text) if text == "AG")); + let missing = runtime + .method_genotype_get( + &[genotype.clone(), MontyObject::String("missing".to_owned())], + &[], + ) + .unwrap(); + assert!(matches!(missing, MontyObject::None)); + + let variant = runtime + .method_variant( + std::slice::from_ref(&bioscript), + &[ + ( + MontyObject::String("rsid".to_owned()), + MontyObject::String("rs2".to_owned()), + ), + ( + MontyObject::String("grch38".to_owned()), + MontyObject::String("2:20".to_owned()), + ), + ( + MontyObject::String("kind".to_owned()), + MontyObject::String("snp".to_owned()), + ), + ], + ) + .unwrap(); + let lookup = runtime + .method_genotype_lookup_variant(&[genotype.clone(), variant.clone()], &[]) + .unwrap(); + assert!(matches!(lookup, MontyObject::String(ref text) if text == "CT")); + let details = runtime + 
.method_genotype_lookup_variant_details(&[genotype.clone(), variant.clone()], &[]) + .unwrap(); + assert!(matches!( + attr(&details, "matched_rsid"), + Some(MontyObject::String(text)) if text == "rs2" + )); + + let plan = runtime + .method_query_plan(&[bioscript.clone(), MontyObject::List(vec![variant])], &[]) + .unwrap(); + let values = runtime + .method_genotype_lookup_variants(&[genotype.clone(), plan.clone()], &[]) + .unwrap(); + assert!(matches!(values, MontyObject::List(items) if items.len() == 1)); + let detail_values = runtime + .method_genotype_lookup_variants_details(&[genotype, plan], &[]) + .unwrap(); + assert!(matches!(detail_values, MontyObject::List(items) if items.len() == 1)); + assert!(runtime.timing_snapshot().len() >= 4); + } + + #[test] + fn runtime_private_methods_cover_successful_text_tsv_and_trace_paths() { + let root = temp_dir("host-output"); + let runtime = BioscriptRuntime::new(&root).unwrap(); + let bioscript = bioscript_object(); + let rows = MontyObject::List(vec![MontyObject::Dict( + vec![ + ( + MontyObject::String("rsid".to_owned()), + MontyObject::String("rs1".to_owned()), + ), + (MontyObject::String("count".to_owned()), MontyObject::Int(2)), + ( + MontyObject::String("ok".to_owned()), + MontyObject::Bool(true), + ), + (MontyObject::String("note".to_owned()), MontyObject::None), + ] + .into(), + )]); + runtime + .method_write_tsv( + &[ + bioscript.clone(), + MontyObject::String("out/table.tsv".to_owned()), + rows, + ], + &[], + ) + .unwrap(); + let table = fs::read_to_string(root.join("out/table.tsv")).unwrap(); + assert!(table.contains("rs1")); + assert!(table.contains("true")); + + runtime + .method_write_text( + &[ + bioscript.clone(), + MontyObject::String("out/text.txt".to_owned()), + MontyObject::String("hello".to_owned()), + ], + &[], + ) + .unwrap(); + let text = runtime + .method_read_text( + &[bioscript, MontyObject::String("out/text.txt".to_owned())], + &[], + ) + .unwrap(); + assert!(matches!(text, 
MontyObject::String(ref value) if value == "hello")); + + runtime.state.trace_lines.lock().unwrap().extend([1, 2, 99]); + runtime + .write_trace_report( + &root.join("trace/report.tsv"), + "bioscript.variant(rsid='rs1')\nplain = 1\n", + ) + .unwrap(); + let trace = fs::read_to_string(root.join("trace/report.tsv")).unwrap(); + assert!(trace.contains("https://www.ncbi.nlm.nih.gov/snp/rs1")); + assert_eq!(runtime.timing_snapshot().len(), 1); + } } From d320fd9658bfdf6f2bfb313cdd429e2eabc11c46 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 30 Apr 2026 17:27:34 +1000 Subject: [PATCH 9/9] modularizing files --- AGENTS.md | 21 + coverage.sh | 2 + rust/bioscript-cli/src/commands.rs | 182 ++ rust/bioscript-cli/src/main.rs | 609 +------ rust/bioscript-cli/src/manifest.rs | 336 ++++ rust/bioscript-cli/src/paths.rs | 57 + rust/bioscript-cli/tests/cli.rs | 981 +---------- rust/bioscript-cli/tests/cli/args.rs | 206 +++ rust/bioscript-cli/tests/cli/manifests.rs | 404 +++++ rust/bioscript-cli/tests/cli/runtime.rs | 204 +++ rust/bioscript-cli/tests/cli/subcommands.rs | 164 ++ rust/bioscript-core/tests/source_size.rs | 179 ++ rust/bioscript-formats/src/alignment.rs | 644 +------ .../src/alignment/cram_stream.rs | 498 ++++++ .../src/alignment/readers.rs | 115 ++ rust/bioscript-formats/src/genotype.rs | 62 +- rust/bioscript-formats/tests/file_formats.rs | 1483 +---------------- .../tests/file_formats/alignment.rs | 76 + .../tests/file_formats/basic.rs | 269 +++ .../tests/file_formats/cram.rs | 574 +++++++ .../tests/file_formats/delimited.rs | 103 ++ .../tests/file_formats/vcf.rs | 230 +++ .../tests/file_formats/zip_and_fixtures.rs | 230 +++ rust/bioscript-wasm/src/inspect_api.rs | 159 ++ rust/bioscript-wasm/src/lib.rs | 629 +------ rust/bioscript-wasm/src/lookup_api.rs | 334 ++++ rust/bioscript-wasm/src/variant_yaml.rs | 86 + test.sh | 11 +- 28 files changed, 4564 insertions(+), 4284 deletions(-) create mode 100644 AGENTS.md create mode 100644 rust/bioscript-cli/src/commands.rs 
create mode 100644 rust/bioscript-cli/src/manifest.rs create mode 100644 rust/bioscript-cli/src/paths.rs create mode 100644 rust/bioscript-cli/tests/cli/args.rs create mode 100644 rust/bioscript-cli/tests/cli/manifests.rs create mode 100644 rust/bioscript-cli/tests/cli/runtime.rs create mode 100644 rust/bioscript-cli/tests/cli/subcommands.rs create mode 100644 rust/bioscript-core/tests/source_size.rs create mode 100644 rust/bioscript-formats/src/alignment/cram_stream.rs create mode 100644 rust/bioscript-formats/src/alignment/readers.rs create mode 100644 rust/bioscript-formats/tests/file_formats/alignment.rs create mode 100644 rust/bioscript-formats/tests/file_formats/basic.rs create mode 100644 rust/bioscript-formats/tests/file_formats/cram.rs create mode 100644 rust/bioscript-formats/tests/file_formats/delimited.rs create mode 100644 rust/bioscript-formats/tests/file_formats/vcf.rs create mode 100644 rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs create mode 100644 rust/bioscript-wasm/src/inspect_api.rs create mode 100644 rust/bioscript-wasm/src/lookup_api.rs create mode 100644 rust/bioscript-wasm/src/variant_yaml.rs diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..25de3cc --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,21 @@ +# BioScript Agent Notes + +## Source Size Heuristic + +Keep first-party production Rust source files at or below 500 lines. This applies +to files under `rust/bioscript-*/src/**/*.rs`. + +The 500-line rule does not apply to: + +- integration tests and unit-test modules +- vendored code and patched path dependencies +- generated code, if any is added later + +Put substantial test coverage in separate test modules under `tests/` so the +production limit measures production code, not test scaffolding. Test files +should still be split when they mix unrelated behavior or become hard to scan. + +When a production file grows past 500 lines, split it before adding more +behavior. 
Temporary exceptions must be listed in +`rust/bioscript-core/tests/source_size.rs` with their current line count, and +that count should not increase. diff --git a/coverage.sh b/coverage.sh index 305a369..a3f9a4d 100755 --- a/coverage.sh +++ b/coverage.sh @@ -180,6 +180,7 @@ if [[ -n "$FOCUSED_TEST" ]]; then ;; core) env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --lib + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --test source_size -- --nocapture --test-threads="$TEST_THREADS" ;; runtime_lib) env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --lib @@ -207,6 +208,7 @@ else env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-cli --bin bioscript env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-schema --test validate_variants -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --lib + env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-core --test source_size -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --lib env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test security -- --nocapture --test-threads="$TEST_THREADS" env "${COV_ENV[@]}" cargo llvm-cov --no-report -p bioscript-runtime --test resources_coverage -- --nocapture --test-threads="$TEST_THREADS" diff --git a/rust/bioscript-cli/src/commands.rs b/rust/bioscript-cli/src/commands.rs new file mode 100644 index 0000000..933abb9 --- /dev/null +++ b/rust/bioscript-cli/src/commands.rs @@ -0,0 +1,182 @@ +use std::{env, fs, path::PathBuf}; + +use bioscript_formats::{ + GenotypeSourceFormat, InspectOptions, PrepareRequest, inspect_file, prepare_indexes, + shell_flags, +}; +use bioscript_schema::{validate_panels_path, validate_variants_path}; + +pub(crate) fn run_prepare(args: Vec) -> Result<(), String> { + let mut root: Option = None; + let mut input_file: Option = None; + let mut 
reference_file: Option = None; + let mut input_format: Option = None; + let mut cache_dir: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--root" => { + root = Some(PathBuf::from( + iter.next().ok_or("--root requires a directory")?, + )); + } + "--input-file" => { + input_file = Some(iter.next().ok_or("--input-file requires a path")?); + } + "--reference-file" => { + reference_file = Some(iter.next().ok_or("--reference-file requires a path")?); + } + "--input-format" => { + let value = iter.next().ok_or("--input-format requires a value")?; + if !value.eq_ignore_ascii_case("auto") { + input_format = Some( + value + .parse::() + .map_err(|err| format!("invalid --input-format: {err}"))?, + ); + } + } + "--cache-dir" => { + cache_dir = Some(PathBuf::from( + iter.next().ok_or("--cache-dir requires a path")?, + )); + } + other => { + return Err(format!("unexpected argument: {other}")); + } + } + } + + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let effective_root = root.unwrap_or_else(|| cwd.clone()); + let effective_cache = cache_dir.unwrap_or_else(|| cwd.join(".bioscript-cache")); + + let request = PrepareRequest { + root: effective_root, + cwd, + cache_dir: effective_cache, + input_file, + input_format, + reference_file, + }; + + let prepared = prepare_indexes(&request)?; + let flags = shell_flags(&prepared); + if flags.is_empty() { + eprintln!("bioscript prepare: nothing to index"); + } else { + println!("{flags}"); + } + + Ok(()) +} + +pub(crate) fn run_inspect(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut options = InspectOptions::default(); + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--input-index" => { + options.input_index = Some(PathBuf::from( + iter.next().ok_or("--input-index requires a path")?, + )); + } + "--reference-file" => { + options.reference_file = 
Some(PathBuf::from( + iter.next().ok_or("--reference-file requires a path")?, + )); + } + "--reference-index" => { + options.reference_index = Some(PathBuf::from( + iter.next().ok_or("--reference-index requires a path")?, + )); + } + other if path.is_none() => { + path = Some(PathBuf::from(other)); + } + other => { + return Err(format!("unexpected argument: {other}")); + } + } + } + + let Some(path) = path else { + return Err( + "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + .to_owned(), + ); + }; + + let inspection = inspect_file(&path, &options).map_err(|err| err.to_string())?; + println!("{}", inspection.render_text()); + Ok(()) +} + +pub(crate) fn run_validate_variants(args: Vec) -> Result<(), String> { + run_validation_command( + args, + "usage: bioscript validate-variants [--report ]", + validate_variants_path, + ) +} + +pub(crate) fn run_validate_panels(args: Vec) -> Result<(), String> { + run_validation_command( + args, + "usage: bioscript validate-panels [--report ]", + validate_panels_path, + ) +} + +fn run_validation_command(args: Vec, usage: &str, validate: F) -> Result<(), String> +where + F: FnOnce(&std::path::Path) -> Result, +{ + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err(usage.to_owned()); + }; + + let report = validate(&path)?; + let text = report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", 
parent.display()) + })?; + } + fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index 077ebd2..3c0a73c 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -1,24 +1,24 @@ use std::{ - collections::BTreeMap, env, - fmt::Write as _, - fs, - path::{Path, PathBuf}, + path::PathBuf, process::ExitCode, time::{Duration, Instant}, }; use bioscript_formats::{ - GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, InspectOptions, PrepareRequest, - inspect_file, prepare_indexes, shell_flags, + GenotypeLoadOptions, GenotypeSourceFormat, PrepareRequest, prepare_indexes, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; -use bioscript_schema::{ - PanelManifest, VariantManifest, load_panel_manifest, load_variant_manifest, - validate_panels_path, validate_variants_path, -}; use monty::ResourceLimits; +mod commands; +mod manifest; +mod paths; + +use commands::{run_inspect, run_prepare, run_validate_panels, run_validate_variants}; +use manifest::{ManifestRunOptions, is_yaml_manifest, run_manifest}; +use paths::{normalize_loader_paths, write_timing_report}; + fn main() -> ExitCode { match run_cli() { Ok(()) => ExitCode::SUCCESS, @@ -285,592 +285,17 @@ fn run_cli_args(raw_args: Vec) -> Result<(), String> { Ok(()) } -fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { - if let Some(parent) = path.parent() { - fs::create_dir_all(parent).map_err(|err| { - format!( - "failed to create timing report dir {}: {err}", - parent.display() - ) - })?; - } - let mut output = String::from("stage\tduration_ms\tdetail\n"); - for timing in timings { - let _ = writeln!( - output, - "{}\t{}\t{}", - 
timing.stage, - timing.duration_ms, - timing.detail.replace('\t', " ") - ); - } - fs::write(path, output) - .map_err(|err| format!("failed to write timing report {}: {err}", path.display())) -} - -fn run_prepare(args: Vec) -> Result<(), String> { - let mut root: Option = None; - let mut input_file: Option = None; - let mut reference_file: Option = None; - let mut input_format: Option = None; - let mut cache_dir: Option = None; - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - match arg.as_str() { - "--root" => { - root = Some(PathBuf::from( - iter.next().ok_or("--root requires a directory")?, - )); - } - "--input-file" => { - input_file = Some(iter.next().ok_or("--input-file requires a path")?); - } - "--reference-file" => { - reference_file = Some(iter.next().ok_or("--reference-file requires a path")?); - } - "--input-format" => { - let value = iter.next().ok_or("--input-format requires a value")?; - if !value.eq_ignore_ascii_case("auto") { - input_format = Some( - value - .parse::() - .map_err(|err| format!("invalid --input-format: {err}"))?, - ); - } - } - "--cache-dir" => { - cache_dir = Some(PathBuf::from( - iter.next().ok_or("--cache-dir requires a path")?, - )); - } - other => { - return Err(format!("unexpected argument: {other}")); - } - } - } - - let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; - let effective_root = root.unwrap_or_else(|| cwd.clone()); - let effective_cache = cache_dir.unwrap_or_else(|| cwd.join(".bioscript-cache")); - - let request = PrepareRequest { - root: effective_root, - cwd, - cache_dir: effective_cache, - input_file, - input_format, - reference_file, - }; - - let prepared = prepare_indexes(&request)?; - - // print the flags that should be passed to a subsequent bioscript run - let flags = shell_flags(&prepared); - if flags.is_empty() { - eprintln!("bioscript prepare: nothing to index"); - } else { - println!("{flags}"); - } - - Ok(()) -} - -fn run_inspect(args: Vec) -> 
Result<(), String> { - let mut path: Option = None; - let mut options = InspectOptions::default(); - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - match arg.as_str() { - "--input-index" => { - options.input_index = Some(PathBuf::from( - iter.next().ok_or("--input-index requires a path")?, - )); - } - "--reference-file" => { - options.reference_file = Some(PathBuf::from( - iter.next().ok_or("--reference-file requires a path")?, - )); - } - "--reference-index" => { - options.reference_index = Some(PathBuf::from( - iter.next().ok_or("--reference-index requires a path")?, - )); - } - other if path.is_none() => { - path = Some(PathBuf::from(other)); - } - other => { - return Err(format!("unexpected argument: {other}")); - } - } - } - - let Some(path) = path else { - return Err( - "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" - .to_owned(), - ); - }; - - let inspection = inspect_file(&path, &options).map_err(|err| err.to_string())?; - println!("{}", inspection.render_text()); - Ok(()) -} - -fn run_validate_variants(args: Vec) -> Result<(), String> { - let mut path: Option = None; - let mut report_path: Option = None; - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - if arg == "--report" { - let Some(value) = iter.next() else { - return Err("--report requires a path".to_owned()); - }; - report_path = Some(PathBuf::from(value)); - } else if path.is_none() { - path = Some(PathBuf::from(arg)); - } else { - return Err(format!("unexpected argument: {arg}")); - } - } - - let Some(path) = path else { - return Err("usage: bioscript validate-variants [--report ]".to_owned()); - }; - - let report = validate_variants_path(&path)?; - let text = report.render_text(); - print!("{text}"); - - if let Some(report_path) = report_path { - if let Some(parent) = report_path.parent() { - std::fs::create_dir_all(parent).map_err(|err| { - format!("failed to create report dir {}: {err}", 
parent.display()) - })?; - } - std::fs::write(&report_path, text) - .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; - } - - if report.has_errors() { - return Err(format!( - "validation found {} errors and {} warnings", - report.total_errors(), - report.total_warnings() - )); - } - - Ok(()) -} - -fn run_validate_panels(args: Vec) -> Result<(), String> { - let mut path: Option = None; - let mut report_path: Option = None; - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - if arg == "--report" { - let Some(value) = iter.next() else { - return Err("--report requires a path".to_owned()); - }; - report_path = Some(PathBuf::from(value)); - } else if path.is_none() { - path = Some(PathBuf::from(arg)); - } else { - return Err(format!("unexpected argument: {arg}")); - } - } - - let Some(path) = path else { - return Err("usage: bioscript validate-panels [--report ]".to_owned()); - }; - - let report = validate_panels_path(&path)?; - let text = report.render_text(); - print!("{text}"); - - if let Some(report_path) = report_path { - if let Some(parent) = report_path.parent() { - std::fs::create_dir_all(parent).map_err(|err| { - format!("failed to create report dir {}: {err}", parent.display()) - })?; - } - std::fs::write(&report_path, text) - .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; - } - - if report.has_errors() { - return Err(format!( - "validation found {} errors and {} warnings", - report.total_errors(), - report.total_warnings() - )); - } - - Ok(()) -} - -fn is_yaml_manifest(path: &Path) -> bool { - path.extension() - .and_then(|ext| ext.to_str()) - .is_some_and(|ext| matches!(ext, "yaml" | "yml")) -} - -struct ManifestRunOptions<'a> { - input_file: Option<&'a str>, - output_file: Option<&'a str>, - participant_id: Option<&'a str>, - trace_report: Option<&'a Path>, - loader: &'a GenotypeLoadOptions, - filters: &'a [String], -} - -fn run_manifest( - runtime_root: &Path, - 
manifest_path: &Path, - options: &ManifestRunOptions<'_>, -) -> Result<(), String> { - let schema = manifest_schema(manifest_path)?; - let resolved_input = options - .input_file - .map(|value| resolve_cli_path(runtime_root, value)); - let resolved_output = options - .output_file - .map(|value| resolve_cli_path_buf(runtime_root, Path::new(value))); - let resolved_trace = options - .trace_report - .map(|value| resolve_cli_path_buf(runtime_root, value)); - match schema.as_str() { - "bioscript:variant:1.0" | "bioscript:variant" => { - let manifest = load_variant_manifest(manifest_path)?; - let row = run_variant_manifest( - runtime_root, - &manifest, - resolved_input.as_deref(), - options.participant_id, - options.loader, - )?; - write_manifest_outputs( - std::slice::from_ref(&row), - resolved_output.as_deref(), - resolved_trace.as_deref(), - )?; - Ok(()) - } - "bioscript:panel:1.0" => { - let manifest = load_panel_manifest(manifest_path)?; - let rows = run_panel_manifest( - runtime_root, - &manifest, - resolved_input.as_deref(), - options.participant_id, - options.loader, - options.filters, - )?; - write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; - Ok(()) - } - other => Err(format!("unsupported manifest schema '{other}'")), - } -} - -fn run_variant_manifest( - runtime_root: &Path, - manifest: &VariantManifest, - input_file: Option<&str>, - participant_id: Option<&str>, - loader: &GenotypeLoadOptions, -) -> Result, String> { - let input_file = input_file.ok_or("manifest execution requires --input-file")?; - let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) - .map_err(|err| err.to_string())?; - let observation = store - .lookup_variant(&manifest.spec) - .map_err(|err| err.to_string())?; - Ok(variant_row( - runtime_root, - &manifest.path, - &manifest.name, - &manifest.tags, - &observation, - participant_id, - )) -} - -fn run_panel_manifest( - runtime_root: &Path, - panel: &PanelManifest, - input_file: 
Option<&str>, - participant_id: Option<&str>, - loader: &GenotypeLoadOptions, - filters: &[String], -) -> Result>, String> { - let input_file = input_file.ok_or("manifest execution requires --input-file")?; - let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) - .map_err(|err| err.to_string())?; - let mut rows = Vec::new(); - - for member in &panel.members { - if member.kind != "variant" { - return Err(format!( - "panel member kind '{}' is not executable yet; panel execution is currently variant-only", - member.kind - )); - } - let Some(path) = &member.path else { - return Err("remote panel members are not executable yet".to_owned()); - }; - let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; - let manifest = load_variant_manifest(&resolved)?; - if !matches_filters(&manifest, &resolved, filters) { - continue; - } - let observation = store - .lookup_variant(&manifest.spec) - .map_err(|err| err.to_string())?; - rows.push(variant_row( - runtime_root, - &resolved, - &manifest.name, - &manifest.tags, - &observation, - participant_id, - )); - } - - Ok(rows) -} - -fn variant_row( - runtime_root: &Path, - path: &Path, - name: &str, - tags: &[String], - observation: &bioscript_core::VariantObservation, - participant_id: Option<&str>, -) -> BTreeMap { - let mut row = BTreeMap::new(); - row.insert("kind".to_owned(), "variant".to_owned()); - row.insert("name".to_owned(), name.to_owned()); - row.insert( - "path".to_owned(), - path.strip_prefix(runtime_root) - .unwrap_or(path) - .display() - .to_string(), - ); - row.insert("tags".to_owned(), tags.join(",")); - row.insert("backend".to_owned(), observation.backend.clone()); - row.insert( - "participant_id".to_owned(), - participant_id.unwrap_or_default().to_owned(), - ); - row.insert( - "matched_rsid".to_owned(), - observation.matched_rsid.clone().unwrap_or_default(), - ); - row.insert( - "assembly".to_owned(), - observation - .assembly - .map(|value| match value { - 
bioscript_core::Assembly::Grch37 => "grch37".to_owned(), - bioscript_core::Assembly::Grch38 => "grch38".to_owned(), - }) - .unwrap_or_default(), - ); - row.insert( - "genotype".to_owned(), - observation.genotype.clone().unwrap_or_default(), - ); - row.insert( - "ref_count".to_owned(), - observation - .ref_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "alt_count".to_owned(), - observation - .alt_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "depth".to_owned(), - observation - .depth - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert("evidence".to_owned(), observation.evidence.join(" | ")); - row -} - -fn write_manifest_outputs( - rows: &[BTreeMap], - output_file: Option<&Path>, - trace_report: Option<&Path>, -) -> Result<(), String> { - let text = render_rows_as_tsv(rows); - if let Some(output_file) = output_file { - if let Some(parent) = output_file.parent() { - fs::create_dir_all(parent).map_err(|err| { - format!("failed to create output dir {}: {err}", parent.display()) - })?; - } - fs::write(output_file, &text) - .map_err(|err| format!("failed to write output {}: {err}", output_file.display()))?; - } else { - print!("{text}"); - } - - if let Some(trace_report) = trace_report { - if let Some(parent) = trace_report.parent() { - fs::create_dir_all(parent) - .map_err(|err| format!("failed to create trace dir {}: {err}", parent.display()))?; - } - let mut trace = String::from("step\tline\tcode\n"); - for (idx, row) in rows.iter().enumerate() { - let _ = writeln!( - trace, - "{}\t{}\t{}", - idx + 1, - idx + 1, - row.get("path").cloned().unwrap_or_default() - ); - } - fs::write(trace_report, trace) - .map_err(|err| format!("failed to write trace {}: {err}", trace_report.display()))?; - } - - Ok(()) -} - -fn resolve_cli_path(root: &Path, value: &str) -> String { - resolve_cli_path_buf(root, Path::new(value)) - .display() - .to_string() -} - -fn resolve_cli_path_buf(root: 
&Path, value: &Path) -> PathBuf { - if value.is_absolute() { - value.to_path_buf() - } else { - root.join(value) - } -} - -fn render_rows_as_tsv(rows: &[BTreeMap]) -> String { - let headers = [ - "kind", - "name", - "path", - "tags", - "participant_id", - "backend", - "matched_rsid", - "assembly", - "genotype", - "ref_count", - "alt_count", - "depth", - "evidence", - ]; - let mut out = headers.join("\t"); - out.push('\n'); - for row in rows { - let line = headers - .iter() - .map(|header| { - row.get(*header) - .cloned() - .unwrap_or_default() - .replace('\t', " ") - }) - .collect::>() - .join("\t"); - out.push_str(&line); - out.push('\n'); - } - out -} - -fn matches_filters(manifest: &VariantManifest, path: &Path, filters: &[String]) -> bool { - filters.iter().all(|filter| match filter.split_once('=') { - Some(("kind", value)) => value == "variant", - Some(("name", value)) => manifest.name.contains(value), - Some(("path", value)) => path.display().to_string().contains(value), - Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), - Some(_) | None => false, - }) -} - -fn resolve_manifest_path( - runtime_root: &Path, - manifest_path: &Path, - relative: &str, -) -> Result { - let base_dir = manifest_path - .parent() - .ok_or_else(|| format!("manifest has no parent: {}", manifest_path.display()))?; - let joined = base_dir.join(relative); - let canonical_root = runtime_root - .canonicalize() - .map_err(|err| format!("failed to resolve root {}: {err}", runtime_root.display()))?; - let canonical_base = base_dir.canonicalize().map_err(|err| { - format!( - "failed to resolve manifest dir {}: {err}", - base_dir.display() - ) - })?; - let canonical_joined = joined - .canonicalize() - .map_err(|err| format!("failed to resolve {}: {err}", joined.display()))?; - let boundary = if canonical_base.starts_with(&canonical_root) { - &canonical_root - } else { - &canonical_base - }; - if !canonical_joined.starts_with(boundary) { - return Err(format!( - "manifest 
member path escapes bioscript root: {}", - canonical_joined.display() - )); - } - Ok(canonical_joined) -} - -fn manifest_schema(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read {}: {err}", path.display()))?; - let value: serde_yaml::Value = serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; - value - .as_mapping() - .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) - .and_then(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) - .ok_or_else(|| format!("{} is missing schema", path.display())) -} - -fn normalize_loader_paths(root: &Path, loader: &mut GenotypeLoadOptions) { - if let Some(path) = loader.input_index.take() { - loader.input_index = Some(resolve_cli_path_buf(root, &path)); - } - if let Some(path) = loader.reference_file.take() { - loader.reference_file = Some(resolve_cli_path_buf(root, &path)); - } - if let Some(path) = loader.reference_index.take() { - loader.reference_index = Some(resolve_cli_path_buf(root, &path)); - } -} - #[cfg(test)] mod tests { use super::*; + use crate::manifest::{ + manifest_schema, matches_filters, render_rows_as_tsv, resolve_manifest_path, variant_row, + }; + use crate::paths::{normalize_loader_paths, resolve_cli_path, resolve_cli_path_buf}; use bioscript_core::{Assembly, VariantObservation}; + use bioscript_schema::VariantManifest; + use std::fs; + use std::path::Path; use std::time::{SystemTime, UNIX_EPOCH}; fn temp_dir(label: &str) -> PathBuf { diff --git a/rust/bioscript-cli/src/manifest.rs b/rust/bioscript-cli/src/manifest.rs new file mode 100644 index 0000000..03cf9ae --- /dev/null +++ b/rust/bioscript-cli/src/manifest.rs @@ -0,0 +1,336 @@ +use std::{collections::BTreeMap, fmt::Write as _, fs, path::Path}; + +use bioscript_formats::{GenotypeLoadOptions, GenotypeStore}; +use bioscript_schema::{ + PanelManifest, VariantManifest, load_panel_manifest, load_variant_manifest, +}; + +use 
crate::paths::{resolve_cli_path, resolve_cli_path_buf}; + +pub(crate) struct ManifestRunOptions<'a> { + pub(crate) input_file: Option<&'a str>, + pub(crate) output_file: Option<&'a str>, + pub(crate) participant_id: Option<&'a str>, + pub(crate) trace_report: Option<&'a Path>, + pub(crate) loader: &'a GenotypeLoadOptions, + pub(crate) filters: &'a [String], +} + +pub(crate) fn is_yaml_manifest(path: &Path) -> bool { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| matches!(ext, "yaml" | "yml")) +} + +pub(crate) fn run_manifest( + runtime_root: &Path, + manifest_path: &Path, + options: &ManifestRunOptions<'_>, +) -> Result<(), String> { + let schema = manifest_schema(manifest_path)?; + let resolved_input = options + .input_file + .map(|value| resolve_cli_path(runtime_root, value)); + let resolved_output = options + .output_file + .map(|value| resolve_cli_path_buf(runtime_root, Path::new(value))); + let resolved_trace = options + .trace_report + .map(|value| resolve_cli_path_buf(runtime_root, value)); + match schema.as_str() { + "bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = load_variant_manifest(manifest_path)?; + let row = run_variant_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + )?; + write_manifest_outputs( + std::slice::from_ref(&row), + resolved_output.as_deref(), + resolved_trace.as_deref(), + )?; + Ok(()) + } + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + let rows = run_panel_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + options.filters, + )?; + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn run_variant_manifest( + runtime_root: &Path, + manifest: &VariantManifest, + input_file: Option<&str>, + participant_id: 
Option<&str>, + loader: &GenotypeLoadOptions, +) -> Result, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + Ok(variant_row( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )) +} + +fn run_panel_manifest( + runtime_root: &Path, + panel: &PanelManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let mut rows = Vec::new(); + + for member in &panel.members { + if member.kind != "variant" { + return Err(format!( + "panel member kind '{}' is not executable yet; panel execution is currently variant-only", + member.kind + )); + } + let Some(path) = &member.path else { + return Err("remote panel members are not executable yet".to_owned()); + }; + let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + rows.push(variant_row( + runtime_root, + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } + + Ok(rows) +} + +pub(crate) fn variant_row( + runtime_root: &Path, + path: &Path, + name: &str, + tags: &[String], + observation: &bioscript_core::VariantObservation, + participant_id: Option<&str>, +) -> BTreeMap { + let mut row = BTreeMap::new(); + row.insert("kind".to_owned(), "variant".to_owned()); + 
row.insert("name".to_owned(), name.to_owned()); + row.insert( + "path".to_owned(), + path.strip_prefix(runtime_root) + .unwrap_or(path) + .display() + .to_string(), + ); + row.insert("tags".to_owned(), tags.join(",")); + row.insert("backend".to_owned(), observation.backend.clone()); + row.insert( + "participant_id".to_owned(), + participant_id.unwrap_or_default().to_owned(), + ); + row.insert( + "matched_rsid".to_owned(), + observation.matched_rsid.clone().unwrap_or_default(), + ); + row.insert( + "assembly".to_owned(), + observation + .assembly + .map(|value| match value { + bioscript_core::Assembly::Grch37 => "grch37".to_owned(), + bioscript_core::Assembly::Grch38 => "grch38".to_owned(), + }) + .unwrap_or_default(), + ); + row.insert( + "genotype".to_owned(), + observation.genotype.clone().unwrap_or_default(), + ); + row.insert( + "ref_count".to_owned(), + observation + .ref_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "alt_count".to_owned(), + observation + .alt_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "depth".to_owned(), + observation + .depth + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert("evidence".to_owned(), observation.evidence.join(" | ")); + row +} + +fn write_manifest_outputs( + rows: &[BTreeMap], + output_file: Option<&Path>, + trace_report: Option<&Path>, +) -> Result<(), String> { + let text = render_rows_as_tsv(rows); + if let Some(output_file) = output_file { + if let Some(parent) = output_file.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!("failed to create output dir {}: {err}", parent.display()) + })?; + } + fs::write(output_file, &text) + .map_err(|err| format!("failed to write output {}: {err}", output_file.display()))?; + } else { + print!("{text}"); + } + + if let Some(trace_report) = trace_report { + if let Some(parent) = trace_report.parent() { + fs::create_dir_all(parent) + .map_err(|err| format!("failed to create 
trace dir {}: {err}", parent.display()))?; + } + let mut trace = String::from("step\tline\tcode\n"); + for (idx, row) in rows.iter().enumerate() { + let _ = writeln!( + trace, + "{}\t{}\t{}", + idx + 1, + idx + 1, + row.get("path").cloned().unwrap_or_default() + ); + } + fs::write(trace_report, trace) + .map_err(|err| format!("failed to write trace {}: {err}", trace_report.display()))?; + } + + Ok(()) +} + +pub(crate) fn render_rows_as_tsv(rows: &[BTreeMap]) -> String { + let headers = [ + "kind", + "name", + "path", + "tags", + "participant_id", + "backend", + "matched_rsid", + "assembly", + "genotype", + "ref_count", + "alt_count", + "depth", + "evidence", + ]; + let mut out = headers.join("\t"); + out.push('\n'); + for row in rows { + let line = headers + .iter() + .map(|header| { + row.get(*header) + .cloned() + .unwrap_or_default() + .replace('\t', " ") + }) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + out +} + +pub(crate) fn matches_filters(manifest: &VariantManifest, path: &Path, filters: &[String]) -> bool { + filters.iter().all(|filter| match filter.split_once('=') { + Some(("kind", value)) => value == "variant", + Some(("name", value)) => manifest.name.contains(value), + Some(("path", value)) => path.display().to_string().contains(value), + Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), + Some(_) | None => false, + }) +} + +pub(crate) fn resolve_manifest_path( + runtime_root: &Path, + manifest_path: &Path, + relative: &str, +) -> Result { + let base_dir = manifest_path + .parent() + .ok_or_else(|| format!("manifest has no parent: {}", manifest_path.display()))?; + let joined = base_dir.join(relative); + let canonical_root = runtime_root + .canonicalize() + .map_err(|err| format!("failed to resolve root {}: {err}", runtime_root.display()))?; + let canonical_base = base_dir.canonicalize().map_err(|err| { + format!( + "failed to resolve manifest dir {}: {err}", + base_dir.display() + ) + })?; + let 
canonical_joined = joined + .canonicalize() + .map_err(|err| format!("failed to resolve {}: {err}", joined.display()))?; + let boundary = if canonical_base.starts_with(&canonical_root) { + &canonical_root + } else { + &canonical_base + }; + if !canonical_joined.starts_with(boundary) { + return Err(format!( + "manifest member path escapes bioscript root: {}", + canonical_joined.display() + )); + } + Ok(canonical_joined) +} + +pub(crate) fn manifest_schema(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; + value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{} is missing schema", path.display())) +} diff --git a/rust/bioscript-cli/src/paths.rs b/rust/bioscript-cli/src/paths.rs new file mode 100644 index 0000000..2df0400 --- /dev/null +++ b/rust/bioscript-cli/src/paths.rs @@ -0,0 +1,57 @@ +use std::{ + fmt::Write as _, + fs, + path::{Path, PathBuf}, +}; + +use bioscript_formats::GenotypeLoadOptions; +use bioscript_runtime::StageTiming; + +pub(crate) fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!( + "failed to create timing report dir {}: {err}", + parent.display() + ) + })?; + } + let mut output = String::from("stage\tduration_ms\tdetail\n"); + for timing in timings { + let _ = writeln!( + output, + "{}\t{}\t{}", + timing.stage, + timing.duration_ms, + timing.detail.replace('\t', " ") + ); + } + fs::write(path, output) + .map_err(|err| format!("failed to write timing report {}: {err}", path.display())) +} + +pub(crate) fn resolve_cli_path(root: &Path, value: &str) -> 
String { + resolve_cli_path_buf(root, Path::new(value)) + .display() + .to_string() +} + +pub(crate) fn resolve_cli_path_buf(root: &Path, value: &Path) -> PathBuf { + if value.is_absolute() { + value.to_path_buf() + } else { + root.join(value) + } +} + +pub(crate) fn normalize_loader_paths(root: &Path, loader: &mut GenotypeLoadOptions) { + if let Some(path) = loader.input_index.take() { + loader.input_index = Some(resolve_cli_path_buf(root, &path)); + } + if let Some(path) = loader.reference_file.take() { + loader.reference_file = Some(resolve_cli_path_buf(root, &path)); + } + if let Some(path) = loader.reference_index.take() { + loader.reference_index = Some(resolve_cli_path_buf(root, &path)); + } +} diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index 927882f..13a3bb4 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -44,976 +44,11 @@ fn stderr_text(output: &Output) -> String { String::from_utf8_lossy(&output.stderr).into_owned() } -#[test] -fn cli_reports_usage_when_no_script_or_subcommand_is_provided() { - let root = repo_root(); - - let output = run_bioscript(&root, std::iter::empty::<&str>()); - - assert!(!output.status.success()); - let stderr = stderr_text(&output); - assert!(stderr.contains("usage: bioscript"), "{stderr}"); - assert!(stderr.contains("validate-variants"), "{stderr}"); - assert!(stderr.contains("inspect "), "{stderr}"); -} - -#[test] -fn cli_rejects_missing_values_and_unexpected_arguments() { - let root = repo_root(); - - for (args, expected) in [ - (vec!["--root"], "--root requires a directory"), - (vec!["--input-file"], "--input-file requires a path"), - (vec!["--output-file"], "--output-file requires a path"), - ( - vec!["--participant-id"], - "--participant-id requires a value", - ), - (vec!["--trace-report"], "--trace-report requires a path"), - (vec!["--timing-report"], "--timing-report requires a path"), - (vec!["--filter"], "--filter requires key=value"), - 
(vec!["--input-index"], "--input-index requires a path"), - (vec!["--reference-file"], "--reference-file requires a path"), - ( - vec!["--reference-index"], - "--reference-index requires a path", - ), - (vec!["--cache-dir"], "--cache-dir requires a path"), - ( - vec!["bioscripts/hello-world.py", "extra"], - "unexpected argument: extra", - ), - (vec!["inspect"], "usage: bioscript inspect"), - ( - vec!["inspect", "bioscripts/hello-world.py", "extra"], - "unexpected argument: extra", - ), - ( - vec!["inspect", "--input-index"], - "--input-index requires a path", - ), - ( - vec!["inspect", "--reference-file"], - "--reference-file requires a path", - ), - ( - vec!["inspect", "--reference-index"], - "--reference-index requires a path", - ), - (vec!["prepare", "--root"], "--root requires a directory"), - ( - vec!["prepare", "--input-file"], - "--input-file requires a path", - ), - ( - vec!["prepare", "--reference-file"], - "--reference-file requires a path", - ), - (vec!["prepare", "extra"], "unexpected argument: extra"), - ( - vec!["prepare", "--cache-dir"], - "--cache-dir requires a path", - ), - ( - vec!["validate-variants", "one.yaml", "two.yaml"], - "unexpected argument: two.yaml", - ), - ( - vec!["validate-variants", "--report"], - "--report requires a path", - ), - ( - vec!["validate-panels", "one.yaml", "two.yaml"], - "unexpected argument: two.yaml", - ), - ( - vec!["validate-panels", "--report"], - "--report requires a path", - ), - ] { - let output = run_bioscript(&root, args); - assert!(!output.status.success(), "expected failure for {expected}"); - let stderr = stderr_text(&output); - assert!(stderr.contains(expected), "{stderr}"); - } -} - -#[test] -fn cli_accepts_auto_format_and_explicit_loader_paths_for_script_runs() { - let root = repo_root(); - let dir = temp_dir("loader-args"); - fs::write( - dir.join("script.py"), - r#" -def main(): - print("loader args accepted") - -if __name__ == "__main__": - main() -"#, - ) - .unwrap(); - - let output = 
Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--root") - .arg(&dir) - .arg("--input-format") - .arg("auto") - .arg("--input-index") - .arg("input.crai") - .arg("--reference-file") - .arg("ref.fa") - .arg("--reference-index") - .arg("ref.fa.fai") - .arg("--allow-md5-mismatch") - .arg(dir.join("script.py")) - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - assert!( - String::from_utf8_lossy(&output.stdout).contains("loader args accepted"), - "stdout: {}", - String::from_utf8_lossy(&output.stdout) - ); -} - -#[test] -fn cli_rejects_invalid_numeric_limits_and_input_formats() { - let root = repo_root(); - - for (args, expected) in [ - ( - vec!["--input-format", "bam", "bioscripts/hello-world.py"], - "invalid --input-format value bam", - ), - ( - vec!["--max-duration-ms", "soon", "bioscripts/hello-world.py"], - "invalid --max-duration-ms value soon", - ), - ( - vec!["--max-memory-bytes", "large", "bioscripts/hello-world.py"], - "invalid --max-memory-bytes value large", - ), - ( - vec!["--max-allocations", "many", "bioscripts/hello-world.py"], - "invalid --max-allocations value many", - ), - ( - vec!["--max-recursion-depth", "deep", "bioscripts/hello-world.py"], - "invalid --max-recursion-depth value deep", - ), - ( - vec!["prepare", "--input-format", "bam"], - "invalid --input-format: unsupported input format: bam", - ), - ] { - let output = run_bioscript(&root, args); - assert!(!output.status.success(), "expected failure for {expected}"); - let stderr = stderr_text(&output); - assert!(stderr.contains(expected), "{stderr}"); - } -} - -#[test] -fn cli_rejects_unsupported_manifest_schema() { - let root = repo_root(); - let dir = temp_dir("unsupported-manifest"); - let manifest = dir.join("unsupported.yaml"); - fs::write( - &manifest, - r#" -schema: "bioscript:catalogue:1.0" -version: "1.0" -name: "catalogue" -"#, - ) - .unwrap(); - - let output = 
Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg(&manifest) - .output() - .unwrap(); - - assert!(!output.status.success()); - let stderr = stderr_text(&output); - assert!( - stderr.contains("unsupported manifest schema 'bioscript:catalogue:1.0'"), - "{stderr}" - ); -} - -#[test] -fn hello_world_script_runs_via_cli_and_writes_within_root() { - let root = repo_root(); - let output_path = root.join("bioscripts/output/hello-world.txt"); - if output_path.exists() { - fs::remove_file(&output_path).unwrap(); - } - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("bioscripts/hello-world.py") - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("hello from bioscript")); - assert!(stdout.contains("2 + 3 = 5")); - - let written = fs::read_to_string(output_path).unwrap(); - assert!(written.contains("hello from bioscript")); - assert!(written.contains("loaded: sample input for bioscript")); -} - -#[test] -fn path_escape_is_rejected() { - let root = repo_root(); - let script = root.join("rust/bioscript-cli/tests/fixtures/path_escape.py"); - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg(script) - .output() - .unwrap(); - - assert!(!output.status.success()); - let stderr = String::from_utf8_lossy(&output.stderr); - assert!(stderr.contains("path escapes bioscript root")); -} - -#[test] -fn trace_report_is_written_for_hello_world() { - let root = repo_root(); - let trace_path = root.join("bioscripts/output/hello-world.trace.tsv"); - if trace_path.exists() { - fs::remove_file(&trace_path).unwrap(); - } - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--trace-report") - .arg("bioscripts/output/hello-world.trace.tsv") - .arg("bioscripts/hello-world.py") - .output() - .unwrap(); - - 
assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let trace = fs::read_to_string(trace_path).unwrap(); - assert!(trace.contains("step\tline\tcode")); - assert!(trace.contains("hello from bioscript")); -} - -#[test] -fn timing_report_is_written_for_hello_world() { - let root = repo_root(); - let timing_path = root.join("bioscripts/output/hello-world.timing.tsv"); - if timing_path.exists() { - fs::remove_file(&timing_path).unwrap(); - } - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--timing-report") - .arg("bioscripts/output/hello-world.timing.tsv") - .arg("bioscripts/hello-world.py") - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let timing = fs::read_to_string(timing_path).unwrap(); - assert!(timing.contains("stage\tduration_ms\tdetail")); - assert!(timing.contains("run_file_total\t")); - assert!(timing.contains("script=bioscripts/hello-world.py")); -} - -#[test] -fn auto_index_adds_reference_index_timing_for_script_runs() { - let root = repo_root(); - let dir = temp_dir("auto-index-script"); - let cache_dir = dir.join("cache"); - let timing_path = dir.join("reports/timing.tsv"); - fs::write(dir.join("ref.fa"), b">chr1\nACGT\n").unwrap(); - fs::write( - dir.join("script.py"), - r#" -def main(): - print("indexed") - - -if __name__ == "__main__": - main() -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--root") - .arg(&dir) - .arg("--reference-file") - .arg("ref.fa") - .arg("--auto-index") - .arg("--cache-dir") - .arg(&cache_dir) - .arg("--timing-report") - .arg(&timing_path) - .arg(dir.join("script.py")) - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let stderr = String::from_utf8_lossy(&output.stderr); - assert!( - 
stderr.contains("bioscript: auto-indexed reference ->"), - "{stderr}" - ); - assert!(fs::read_dir(&cache_dir).unwrap().any(|entry| { - entry - .unwrap() - .path() - .extension() - .is_some_and(|ext| ext == "fai") - })); - let timing = fs::read_to_string(timing_path).unwrap(); - assert!(timing.contains("auto_index\t"), "{timing}"); - assert!(timing.contains("run_file_total\t"), "{timing}"); -} - -#[test] -fn batch_lookup_query_plan_runs_and_preserves_requested_result_order() { - let root = repo_root(); - let script = root.join("rust/bioscript-cli/tests/fixtures/batch_lookup.py"); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg(script) - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("AG")); - assert!(stdout.contains("TC")); - assert!(stdout.contains("II")); -} - -#[test] -fn lookup_variant_details_returns_counts_and_decision_fields() { - let root = repo_root(); - let script = root.join("rust/bioscript-cli/tests/fixtures/lookup_details.py"); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg(script) - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("VariantObservation")); - assert!(stdout.contains("genotype='AG'")); - assert!(stdout.contains("raw_counts={")); - assert!(stdout.contains("decision=")); - assert!(stdout.contains("evidence=[")); -} - -#[test] -fn inspect_subcommand_reports_detected_vendor_and_platform() { - let root = repo_root(); - let path = root.join("rust/bioscript-formats/tests/fixtures/ancestrydna_v2_sample.txt"); - - let 
output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("inspect") - .arg(path) - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("kind\tgenotype_text")); - assert!(stdout.contains("vendor\tAncestryDNA")); - assert!(stdout.contains("platform_version\tV2.0")); - assert!(stdout.contains("assembly\tgrch37")); - assert!(stdout.contains("duration_ms\t")); -} - -#[test] -fn prepare_subcommand_reports_reference_index_flags() { - let root = repo_root(); - let dir = temp_dir("prepare-cli"); - let reference = dir.join("ref.fa"); - fs::write(&reference, b">chr1\nACGT\n").unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("prepare") - .arg("--root") - .arg(&dir) - .arg("--reference-file") - .arg("ref.fa") - .arg("--cache-dir") - .arg("cache") - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("--reference-file")); - assert!(stdout.contains("--reference-index")); - assert!(stdout.contains("cache")); -} - -#[test] -fn prepare_subcommand_reports_nothing_to_index_for_noop_auto_request() { - let root = repo_root(); - let dir = temp_dir("prepare-noop-cli"); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("prepare") - .arg("--root") - .arg(&dir) - .arg("--input-format") - .arg("auto") - .output() - .unwrap(); - - assert!(output.status.success(), "stderr: {}", stderr_text(&output)); - assert!(String::from_utf8_lossy(&output.stdout).is_empty()); - assert!( - stderr_text(&output).contains("bioscript prepare: nothing to index"), - "{}", - stderr_text(&output) - ); -} - -#[test] -fn validate_variants_cli_returns_nonzero_and_writes_report() { - let root = 
repo_root(); - let dir = temp_dir("validate-variants-cli"); - let manifest = dir.join("bad-variant.yaml"); - let report = dir.join("reports/variants.txt"); - fs::write( - &manifest, - r#" -schema: "bioscript:variant" -version: "1.0" -variant_id: "TEST_bad" -name: "bad" -identifiers: - rsids: - - "bad-rsid" -coordinates: - grch38: - chrom: "chrUn" - pos: 0 -alleles: - kind: "snv" - ref: "AA" - alts: [] -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("validate-variants") - .arg(&manifest) - .arg("--report") - .arg(&report) - .output() - .unwrap(); - - assert!(!output.status.success()); - let stderr = String::from_utf8_lossy(&output.stderr); - assert!(stderr.contains("validation found"), "{stderr}"); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("errors"), "{stdout}"); - let report_text = fs::read_to_string(report).unwrap(); - assert!(report_text.contains("bad-rsid")); -} - -#[test] -fn validate_panels_cli_returns_nonzero_and_writes_report() { - let root = repo_root(); - let dir = temp_dir("validate-panels-cli"); - let panel = dir.join("bad-panel.yaml"); - let report = dir.join("reports/panels.txt"); - fs::write( - &panel, - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "bad-panel" -members: - - kind: "variant" - path: "../outside.yaml" - sha256: "not-a-sha" -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("validate-panels") - .arg(&panel) - .arg("--report") - .arg(&report) - .output() - .unwrap(); - - assert!(!output.status.success()); - let stderr = String::from_utf8_lossy(&output.stderr); - assert!(stderr.contains("validation found"), "{stderr}"); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("errors"), "{stdout}"); - let report_text = fs::read_to_string(report).unwrap(); - assert!(report_text.contains("members[0].sha256")); -} - -#[test] -fn 
variant_manifest_runs_directly_via_cli() { - let root = repo_root(); - let dir = temp_dir("variant-manifest"); - let manifest = dir.join("rs1.yaml"); - fs::write( - &manifest, - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "example-rs73885319" -tags: - - "type:trait" -identifiers: - rsids: - - "rs73885319" -coordinates: - grch38: - chrom: "22" - pos: 36265860 -alleles: - kind: "snv" - ref: "A" - alts: - - "G" -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg(&manifest) - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("kind\tname\tpath")); - assert!(stdout.contains("example-rs73885319")); - assert!(stdout.contains("AG")); -} - -#[test] -fn variant_manifest_requires_input_file() { - let root = repo_root(); - let dir = temp_dir("variant-manifest-missing-input"); - let manifest = dir.join("rs1.yaml"); - fs::write( - &manifest, - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "example-rs1" -identifiers: - rsids: - - "rs1" -coordinates: - grch38: - chrom: "1" - pos: 10 -alleles: - kind: "snv" - ref: "A" - alts: ["G"] -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg(&manifest) - .output() - .unwrap(); - - assert!(!output.status.success()); - assert!( - stderr_text(&output).contains("manifest execution requires --input-file"), - "{}", - stderr_text(&output) - ); -} - -#[test] -fn variant_manifest_writes_output_trace_and_participant_id() { - let root = repo_root(); - let dir = temp_dir("variant-manifest-output"); - let manifest = dir.join("rs1.yaml"); - let output_path = dir.join("reports/variant.tsv"); - let trace_path = dir.join("reports/variant.trace.tsv"); - fs::write( - &manifest, - r#" 
-schema: "bioscript:variant:1.0" -version: "1.0" -name: "example-rs73885319" -tags: - - "type:trait" -identifiers: - rsids: - - "rs73885319" -coordinates: - grch38: - chrom: "22" - pos: 36265860 -alleles: - kind: "snv" - ref: "A" - alts: - - "G" -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg("--output-file") - .arg(&output_path) - .arg("--participant-id") - .arg("participant-1") - .arg("--trace-report") - .arg(&trace_path) - .arg(&manifest) - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - assert!(String::from_utf8_lossy(&output.stdout).is_empty()); - let table = fs::read_to_string(output_path).unwrap(); - assert!(table.contains("participant-1"), "{table}"); - assert!(table.contains("example-rs73885319"), "{table}"); - let trace = fs::read_to_string(trace_path).unwrap(); - assert!(trace.contains("step\tline\tcode"), "{trace}"); - assert!(trace.contains("rs1.yaml"), "{trace}"); -} - -#[test] -fn panel_manifest_runs_directly_via_cli() { - let root = repo_root(); - let dir = temp_dir("panel-manifest"); - let variants_dir = dir.join("variants"); - fs::create_dir_all(&variants_dir).unwrap(); - fs::write( - variants_dir.join("rs73885319.yaml"), - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "example-rs73885319" -tags: - - "type:trait" -identifiers: - rsids: - - "rs73885319" -coordinates: - grch38: - chrom: "22" - pos: 36265860 -alleles: - kind: "snv" - ref: "A" - alts: - - "G" -"#, - ) - .unwrap(); - fs::write( - variants_dir.join("rs60910145.yaml"), - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "example-rs60910145" -tags: - - "type:trait" -identifiers: - rsids: - - "rs60910145" -coordinates: - grch38: - chrom: "22" - pos: 36265988 -alleles: - kind: "snv" - ref: "T" - alts: - - "G" -"#, - ) - .unwrap(); - let panel = 
dir.join("panel.yaml"); - fs::write( - &panel, - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "example-panel" -tags: - - "type:trait" -members: - - kind: "variant" - path: "variants/rs73885319.yaml" - version: "1.0" - - kind: "variant" - path: "variants/rs60910145.yaml" - version: "1.0" -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg("--filter") - .arg("name=rs73885319") - .arg(&panel) - .output() - .unwrap(); - - assert!( - output.status.success(), - "stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - let stdout = String::from_utf8_lossy(&output.stdout); - assert!(stdout.contains("example-rs73885319")); - assert!(!stdout.contains("example-rs60910145")); -} - -#[test] -fn panel_manifest_filters_by_kind_tag_path_and_rejects_unknown_filter_keys() { - let root = repo_root(); - let dir = temp_dir("panel-filters"); - let variants_dir = dir.join("variants"); - fs::create_dir_all(&variants_dir).unwrap(); - fs::write( - variants_dir.join("rs73885319.yaml"), - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "example-rs73885319" -tags: - - "type:trait" -identifiers: - rsids: - - "rs73885319" -coordinates: - grch38: - chrom: "22" - pos: 36265860 -alleles: - kind: "snv" - ref: "A" - alts: - - "G" -"#, - ) - .unwrap(); - let panel = dir.join("panel.yaml"); - fs::write( - &panel, - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "example-panel" -members: - - kind: "variant" - path: "variants/rs73885319.yaml" - version: "1.0" -"#, - ) - .unwrap(); - - let matched = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg("--filter") - .arg("kind=variant") - .arg("--filter") - .arg("tag=type:trait") - .arg("--filter") - .arg("path=rs73885319") - .arg(&panel) - .output() - .unwrap(); - - assert!( - matched.status.success(), 
- "stderr: {}", - String::from_utf8_lossy(&matched.stderr) - ); - let stdout = String::from_utf8_lossy(&matched.stdout); - assert!(stdout.contains("example-rs73885319"), "{stdout}"); - - let filtered_out = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg("--filter") - .arg("unknown=value") - .arg(&panel) - .output() - .unwrap(); - - assert!( - filtered_out.status.success(), - "stderr: {}", - String::from_utf8_lossy(&filtered_out.stderr) - ); - let stdout = String::from_utf8_lossy(&filtered_out.stdout); - assert!(stdout.starts_with("kind\tname\tpath"), "{stdout}"); - assert!(!stdout.contains("example-rs73885319"), "{stdout}"); -} - -#[test] -fn panel_manifest_reports_remote_members_as_not_executable_yet() { - let root = repo_root(); - let dir = temp_dir("panel-remote-member"); - let panel = dir.join("panel.yaml"); - fs::write( - &panel, - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "remote-panel" -permissions: - domains: - - "https://example.com" -downloads: - - id: "remote-rs73885319" - url: "https://example.com/rs73885319.yaml" - sha256: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - version: "1.0" -members: - - kind: "variant" - download: "remote-rs73885319" - version: "1.0" -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg(&panel) - .output() - .unwrap(); - - assert!(!output.status.success()); - let stderr = String::from_utf8_lossy(&output.stderr); - assert!( - stderr.contains("remote panel members are not executable yet"), - "{stderr}" - ); -} - -#[test] -fn panel_manifest_reports_non_variant_members_as_not_executable_yet() { - let root = repo_root(); - let dir = temp_dir("panel-nonvariant-member"); - let panel = dir.join("panel.yaml"); - fs::write( - &panel, - r#" -schema: "bioscript:panel:1.0" -version: 
"1.0" -name: "mixed-panel" -members: - - kind: "script" - path: "script.py" - version: "1.0" -"#, - ) - .unwrap(); - - let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(&root) - .arg("--input-file") - .arg("old/examples/apol1/test_snps.txt") - .arg(&panel) - .output() - .unwrap(); - - assert!(!output.status.success()); - assert!( - stderr_text(&output).contains("unsupported member kind 'script'"), - "{}", - stderr_text(&output) - ); -} +#[path = "cli/args.rs"] +mod args; +#[path = "cli/manifests.rs"] +mod manifests; +#[path = "cli/runtime.rs"] +mod runtime; +#[path = "cli/subcommands.rs"] +mod subcommands; diff --git a/rust/bioscript-cli/tests/cli/args.rs b/rust/bioscript-cli/tests/cli/args.rs new file mode 100644 index 0000000..15768a7 --- /dev/null +++ b/rust/bioscript-cli/tests/cli/args.rs @@ -0,0 +1,206 @@ +use super::*; + +#[test] +fn cli_reports_usage_when_no_script_or_subcommand_is_provided() { + let root = repo_root(); + + let output = run_bioscript(&root, std::iter::empty::<&str>()); + + assert!(!output.status.success()); + let stderr = stderr_text(&output); + assert!(stderr.contains("usage: bioscript"), "{stderr}"); + assert!(stderr.contains("validate-variants"), "{stderr}"); + assert!(stderr.contains("inspect "), "{stderr}"); +} + +#[test] +fn cli_rejects_missing_values_and_unexpected_arguments() { + let root = repo_root(); + + for (args, expected) in [ + (vec!["--root"], "--root requires a directory"), + (vec!["--input-file"], "--input-file requires a path"), + (vec!["--output-file"], "--output-file requires a path"), + ( + vec!["--participant-id"], + "--participant-id requires a value", + ), + (vec!["--trace-report"], "--trace-report requires a path"), + (vec!["--timing-report"], "--timing-report requires a path"), + (vec!["--filter"], "--filter requires key=value"), + (vec!["--input-index"], "--input-index requires a path"), + (vec!["--reference-file"], "--reference-file requires a path"), + ( + vec!["--reference-index"], + 
"--reference-index requires a path", + ), + (vec!["--cache-dir"], "--cache-dir requires a path"), + ( + vec!["bioscripts/hello-world.py", "extra"], + "unexpected argument: extra", + ), + (vec!["inspect"], "usage: bioscript inspect"), + ( + vec!["inspect", "bioscripts/hello-world.py", "extra"], + "unexpected argument: extra", + ), + ( + vec!["inspect", "--input-index"], + "--input-index requires a path", + ), + ( + vec!["inspect", "--reference-file"], + "--reference-file requires a path", + ), + ( + vec!["inspect", "--reference-index"], + "--reference-index requires a path", + ), + (vec!["prepare", "--root"], "--root requires a directory"), + ( + vec!["prepare", "--input-file"], + "--input-file requires a path", + ), + ( + vec!["prepare", "--reference-file"], + "--reference-file requires a path", + ), + (vec!["prepare", "extra"], "unexpected argument: extra"), + ( + vec!["prepare", "--cache-dir"], + "--cache-dir requires a path", + ), + ( + vec!["validate-variants", "one.yaml", "two.yaml"], + "unexpected argument: two.yaml", + ), + ( + vec!["validate-variants", "--report"], + "--report requires a path", + ), + ( + vec!["validate-panels", "one.yaml", "two.yaml"], + "unexpected argument: two.yaml", + ), + ( + vec!["validate-panels", "--report"], + "--report requires a path", + ), + ] { + let output = run_bioscript(&root, args); + assert!(!output.status.success(), "expected failure for {expected}"); + let stderr = stderr_text(&output); + assert!(stderr.contains(expected), "{stderr}"); + } +} + +#[test] +fn cli_accepts_auto_format_and_explicit_loader_paths_for_script_runs() { + let root = repo_root(); + let dir = temp_dir("loader-args"); + fs::write( + dir.join("script.py"), + r#" +def main(): + print("loader args accepted") + +if __name__ == "__main__": + main() +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--root") + .arg(&dir) + .arg("--input-format") + .arg("auto") + .arg("--input-index") + 
.arg("input.crai") + .arg("--reference-file") + .arg("ref.fa") + .arg("--reference-index") + .arg("ref.fa.fai") + .arg("--allow-md5-mismatch") + .arg(dir.join("script.py")) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + assert!( + String::from_utf8_lossy(&output.stdout).contains("loader args accepted"), + "stdout: {}", + String::from_utf8_lossy(&output.stdout) + ); +} + +#[test] +fn cli_rejects_invalid_numeric_limits_and_input_formats() { + let root = repo_root(); + + for (args, expected) in [ + ( + vec!["--input-format", "bam", "bioscripts/hello-world.py"], + "invalid --input-format value bam", + ), + ( + vec!["--max-duration-ms", "soon", "bioscripts/hello-world.py"], + "invalid --max-duration-ms value soon", + ), + ( + vec!["--max-memory-bytes", "large", "bioscripts/hello-world.py"], + "invalid --max-memory-bytes value large", + ), + ( + vec!["--max-allocations", "many", "bioscripts/hello-world.py"], + "invalid --max-allocations value many", + ), + ( + vec!["--max-recursion-depth", "deep", "bioscripts/hello-world.py"], + "invalid --max-recursion-depth value deep", + ), + ( + vec!["prepare", "--input-format", "bam"], + "invalid --input-format: unsupported input format: bam", + ), + ] { + let output = run_bioscript(&root, args); + assert!(!output.status.success(), "expected failure for {expected}"); + let stderr = stderr_text(&output); + assert!(stderr.contains(expected), "{stderr}"); + } +} + +#[test] +fn cli_rejects_unsupported_manifest_schema() { + let root = repo_root(); + let dir = temp_dir("unsupported-manifest"); + let manifest = dir.join("unsupported.yaml"); + fs::write( + &manifest, + r#" +schema: "bioscript:catalogue:1.0" +version: "1.0" +name: "catalogue" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg(&manifest) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = 
stderr_text(&output); + assert!( + stderr.contains("unsupported manifest schema 'bioscript:catalogue:1.0'"), + "{stderr}" + ); +} diff --git a/rust/bioscript-cli/tests/cli/manifests.rs b/rust/bioscript-cli/tests/cli/manifests.rs new file mode 100644 index 0000000..a14d379 --- /dev/null +++ b/rust/bioscript-cli/tests/cli/manifests.rs @@ -0,0 +1,404 @@ +use super::*; + +#[test] +fn variant_manifest_runs_directly_via_cli() { + let root = repo_root(); + let dir = temp_dir("variant-manifest"); + let manifest = dir.join("rs1.yaml"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&manifest) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("kind\tname\tpath")); + assert!(stdout.contains("example-rs73885319")); + assert!(stdout.contains("AG")); +} + +#[test] +fn variant_manifest_requires_input_file() { + let root = repo_root(); + let dir = temp_dir("variant-manifest-missing-input"); + let manifest = dir.join("rs1.yaml"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs1" +identifiers: + rsids: + - "rs1" +coordinates: + grch38: + chrom: "1" + pos: 10 +alleles: + kind: "snv" + ref: "A" + alts: ["G"] +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg(&manifest) + .output() + .unwrap(); + + assert!(!output.status.success()); + assert!( + stderr_text(&output).contains("manifest execution requires 
--input-file"), + "{}", + stderr_text(&output) + ); +} + +#[test] +fn variant_manifest_writes_output_trace_and_participant_id() { + let root = repo_root(); + let dir = temp_dir("variant-manifest-output"); + let manifest = dir.join("rs1.yaml"); + let output_path = dir.join("reports/variant.tsv"); + let trace_path = dir.join("reports/variant.trace.tsv"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--output-file") + .arg(&output_path) + .arg("--participant-id") + .arg("participant-1") + .arg("--trace-report") + .arg(&trace_path) + .arg(&manifest) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + assert!(String::from_utf8_lossy(&output.stdout).is_empty()); + let table = fs::read_to_string(output_path).unwrap(); + assert!(table.contains("participant-1"), "{table}"); + assert!(table.contains("example-rs73885319"), "{table}"); + let trace = fs::read_to_string(trace_path).unwrap(); + assert!(trace.contains("step\tline\tcode"), "{trace}"); + assert!(trace.contains("rs1.yaml"), "{trace}"); +} + +#[test] +fn panel_manifest_runs_directly_via_cli() { + let root = repo_root(); + let dir = temp_dir("panel-manifest"); + let variants_dir = dir.join("variants"); + fs::create_dir_all(&variants_dir).unwrap(); + fs::write( + variants_dir.join("rs73885319.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + 
alts: + - "G" +"#, + ) + .unwrap(); + fs::write( + variants_dir.join("rs60910145.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs60910145" +tags: + - "type:trait" +identifiers: + rsids: + - "rs60910145" +coordinates: + grch38: + chrom: "22" + pos: 36265988 +alleles: + kind: "snv" + ref: "T" + alts: + - "G" +"#, + ) + .unwrap(); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "example-panel" +tags: + - "type:trait" +members: + - kind: "variant" + path: "variants/rs73885319.yaml" + version: "1.0" + - kind: "variant" + path: "variants/rs60910145.yaml" + version: "1.0" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--filter") + .arg("name=rs73885319") + .arg(&panel) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("example-rs73885319")); + assert!(!stdout.contains("example-rs60910145")); +} + +#[test] +fn panel_manifest_filters_by_kind_tag_path_and_rejects_unknown_filter_keys() { + let root = repo_root(); + let dir = temp_dir("panel-filters"); + let variants_dir = dir.join("variants"); + fs::create_dir_all(&variants_dir).unwrap(); + fs::write( + variants_dir.join("rs73885319.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "example-panel" +members: + - kind: "variant" + path: "variants/rs73885319.yaml" + version: "1.0" +"#, + ) 
+ .unwrap(); + + let matched = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--filter") + .arg("kind=variant") + .arg("--filter") + .arg("tag=type:trait") + .arg("--filter") + .arg("path=rs73885319") + .arg(&panel) + .output() + .unwrap(); + + assert!( + matched.status.success(), + "stderr: {}", + String::from_utf8_lossy(&matched.stderr) + ); + let stdout = String::from_utf8_lossy(&matched.stdout); + assert!(stdout.contains("example-rs73885319"), "{stdout}"); + + let filtered_out = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--filter") + .arg("unknown=value") + .arg(&panel) + .output() + .unwrap(); + + assert!( + filtered_out.status.success(), + "stderr: {}", + String::from_utf8_lossy(&filtered_out.stderr) + ); + let stdout = String::from_utf8_lossy(&filtered_out.stdout); + assert!(stdout.starts_with("kind\tname\tpath"), "{stdout}"); + assert!(!stdout.contains("example-rs73885319"), "{stdout}"); +} + +#[test] +fn panel_manifest_reports_remote_members_as_not_executable_yet() { + let root = repo_root(); + let dir = temp_dir("panel-remote-member"); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "remote-panel" +permissions: + domains: + - "https://example.com" +downloads: + - id: "remote-rs73885319" + url: "https://example.com/rs73885319.yaml" + sha256: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + version: "1.0" +members: + - kind: "variant" + download: "remote-rs73885319" + version: "1.0" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&panel) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = 
String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("remote panel members are not executable yet"), + "{stderr}" + ); +} + +#[test] +fn panel_manifest_reports_non_variant_members_as_not_executable_yet() { + let root = repo_root(); + let dir = temp_dir("panel-nonvariant-member"); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "mixed-panel" +members: + - kind: "script" + path: "script.py" + version: "1.0" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&panel) + .output() + .unwrap(); + + assert!(!output.status.success()); + assert!( + stderr_text(&output).contains("unsupported member kind 'script'"), + "{}", + stderr_text(&output) + ); +} diff --git a/rust/bioscript-cli/tests/cli/runtime.rs b/rust/bioscript-cli/tests/cli/runtime.rs new file mode 100644 index 0000000..c203222 --- /dev/null +++ b/rust/bioscript-cli/tests/cli/runtime.rs @@ -0,0 +1,204 @@ +use super::*; + +#[test] +fn hello_world_script_runs_via_cli_and_writes_within_root() { + let root = repo_root(); + let output_path = root.join("bioscripts/output/hello-world.txt"); + if output_path.exists() { + fs::remove_file(&output_path).unwrap(); + } + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("bioscripts/hello-world.py") + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("hello from bioscript")); + assert!(stdout.contains("2 + 3 = 5")); + + let written = fs::read_to_string(output_path).unwrap(); + assert!(written.contains("hello from bioscript")); + assert!(written.contains("loaded: sample input for bioscript")); +} + +#[test] +fn path_escape_is_rejected() { + let root = repo_root(); + let 
script = root.join("rust/bioscript-cli/tests/fixtures/path_escape.py"); + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg(script) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("path escapes bioscript root")); +} + +#[test] +fn trace_report_is_written_for_hello_world() { + let root = repo_root(); + let trace_path = root.join("bioscripts/output/hello-world.trace.tsv"); + if trace_path.exists() { + fs::remove_file(&trace_path).unwrap(); + } + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--trace-report") + .arg("bioscripts/output/hello-world.trace.tsv") + .arg("bioscripts/hello-world.py") + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let trace = fs::read_to_string(trace_path).unwrap(); + assert!(trace.contains("step\tline\tcode")); + assert!(trace.contains("hello from bioscript")); +} + +#[test] +fn timing_report_is_written_for_hello_world() { + let root = repo_root(); + let timing_path = root.join("bioscripts/output/hello-world.timing.tsv"); + if timing_path.exists() { + fs::remove_file(&timing_path).unwrap(); + } + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--timing-report") + .arg("bioscripts/output/hello-world.timing.tsv") + .arg("bioscripts/hello-world.py") + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let timing = fs::read_to_string(timing_path).unwrap(); + assert!(timing.contains("stage\tduration_ms\tdetail")); + assert!(timing.contains("run_file_total\t")); + assert!(timing.contains("script=bioscripts/hello-world.py")); +} + +#[test] +fn auto_index_adds_reference_index_timing_for_script_runs() { + let root = repo_root(); + let dir = temp_dir("auto-index-script"); + 
let cache_dir = dir.join("cache"); + let timing_path = dir.join("reports/timing.tsv"); + fs::write(dir.join("ref.fa"), b">chr1\nACGT\n").unwrap(); + fs::write( + dir.join("script.py"), + r#" +def main(): + print("indexed") + + +if __name__ == "__main__": + main() +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--root") + .arg(&dir) + .arg("--reference-file") + .arg("ref.fa") + .arg("--auto-index") + .arg("--cache-dir") + .arg(&cache_dir) + .arg("--timing-report") + .arg(&timing_path) + .arg(dir.join("script.py")) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("bioscript: auto-indexed reference ->"), + "{stderr}" + ); + assert!(fs::read_dir(&cache_dir).unwrap().any(|entry| { + entry + .unwrap() + .path() + .extension() + .is_some_and(|ext| ext == "fai") + })); + let timing = fs::read_to_string(timing_path).unwrap(); + assert!(timing.contains("auto_index\t"), "{timing}"); + assert!(timing.contains("run_file_total\t"), "{timing}"); +} + +#[test] +fn batch_lookup_query_plan_runs_and_preserves_requested_result_order() { + let root = repo_root(); + let script = root.join("rust/bioscript-cli/tests/fixtures/batch_lookup.py"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(script) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("AG")); + assert!(stdout.contains("TC")); + assert!(stdout.contains("II")); +} + +#[test] +fn lookup_variant_details_returns_counts_and_decision_fields() { + let root = repo_root(); + let script = 
root.join("rust/bioscript-cli/tests/fixtures/lookup_details.py"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(script) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("VariantObservation")); + assert!(stdout.contains("genotype='AG'")); + assert!(stdout.contains("raw_counts={")); + assert!(stdout.contains("decision=")); + assert!(stdout.contains("evidence=[")); +} diff --git a/rust/bioscript-cli/tests/cli/subcommands.rs b/rust/bioscript-cli/tests/cli/subcommands.rs new file mode 100644 index 0000000..2bfecb2 --- /dev/null +++ b/rust/bioscript-cli/tests/cli/subcommands.rs @@ -0,0 +1,164 @@ +use super::*; + +#[test] +fn inspect_subcommand_reports_detected_vendor_and_platform() { + let root = repo_root(); + let path = root.join("rust/bioscript-formats/tests/fixtures/ancestrydna_v2_sample.txt"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("inspect") + .arg(path) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("kind\tgenotype_text")); + assert!(stdout.contains("vendor\tAncestryDNA")); + assert!(stdout.contains("platform_version\tV2.0")); + assert!(stdout.contains("assembly\tgrch37")); + assert!(stdout.contains("duration_ms\t")); +} + +#[test] +fn prepare_subcommand_reports_reference_index_flags() { + let root = repo_root(); + let dir = temp_dir("prepare-cli"); + let reference = dir.join("ref.fa"); + fs::write(&reference, b">chr1\nACGT\n").unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("prepare") + .arg("--root") + .arg(&dir) + 
.arg("--reference-file") + .arg("ref.fa") + .arg("--cache-dir") + .arg("cache") + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("--reference-file")); + assert!(stdout.contains("--reference-index")); + assert!(stdout.contains("cache")); +} + +#[test] +fn prepare_subcommand_reports_nothing_to_index_for_noop_auto_request() { + let root = repo_root(); + let dir = temp_dir("prepare-noop-cli"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("prepare") + .arg("--root") + .arg(&dir) + .arg("--input-format") + .arg("auto") + .output() + .unwrap(); + + assert!(output.status.success(), "stderr: {}", stderr_text(&output)); + assert!(String::from_utf8_lossy(&output.stdout).is_empty()); + assert!( + stderr_text(&output).contains("bioscript prepare: nothing to index"), + "{}", + stderr_text(&output) + ); +} + +#[test] +fn validate_variants_cli_returns_nonzero_and_writes_report() { + let root = repo_root(); + let dir = temp_dir("validate-variants-cli"); + let manifest = dir.join("bad-variant.yaml"); + let report = dir.join("reports/variants.txt"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant" +version: "1.0" +variant_id: "TEST_bad" +name: "bad" +identifiers: + rsids: + - "bad-rsid" +coordinates: + grch38: + chrom: "chrUn" + pos: 0 +alleles: + kind: "snv" + ref: "AA" + alts: [] +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("validate-variants") + .arg(&manifest) + .arg("--report") + .arg(&report) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("validation found"), "{stderr}"); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("errors"), "{stdout}"); + let report_text = 
fs::read_to_string(report).unwrap(); + assert!(report_text.contains("bad-rsid")); +} + +#[test] +fn validate_panels_cli_returns_nonzero_and_writes_report() { + let root = repo_root(); + let dir = temp_dir("validate-panels-cli"); + let panel = dir.join("bad-panel.yaml"); + let report = dir.join("reports/panels.txt"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "bad-panel" +members: + - kind: "variant" + path: "../outside.yaml" + sha256: "not-a-sha" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("validate-panels") + .arg(&panel) + .arg("--report") + .arg(&report) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("validation found"), "{stderr}"); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("errors"), "{stdout}"); + let report_text = fs::read_to_string(report).unwrap(); + assert!(report_text.contains("members[0].sha256")); +} diff --git a/rust/bioscript-core/tests/source_size.rs b/rust/bioscript-core/tests/source_size.rs new file mode 100644 index 0000000..1722186 --- /dev/null +++ b/rust/bioscript-core/tests/source_size.rs @@ -0,0 +1,179 @@ +use std::{ + collections::BTreeSet, + fs, + path::{Path, PathBuf}, +}; + +const MAX_PRODUCTION_LINES: usize = 500; + +#[test] +fn production_rust_files_stay_under_size_limit() { + let repo_root = repo_root(); + let documented_backlog = documented_refactor_backlog(&repo_root); + let mut actual_oversized = BTreeSet::new(); + let mut failures = Vec::new(); + + for file in production_rust_files(&repo_root) { + let relative = relative_slash_path(&repo_root, &file); + let source = fs::read_to_string(&file) + .unwrap_or_else(|err| panic!("failed to read {relative}: {err}")); + let line_count = production_line_count(&source); + + if line_count > MAX_PRODUCTION_LINES { + 
actual_oversized.insert(relative.clone()); + + if !documented_backlog.contains(&relative) { + failures.push(format!( + "{relative} has {line_count} production lines; split it or add it to AGENTS.md" + )); + } + } + } + + for documented in documented_backlog.difference(&actual_oversized) { + failures.push(format!( + "{documented} is listed in AGENTS.md but is no longer above {MAX_PRODUCTION_LINES} production lines" + )); + } + + assert!( + failures.is_empty(), + "production source size guard failed:\n{}", + failures.join("\n") + ); +} + +fn repo_root() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .ancestors() + .nth(2) + .expect("bioscript-core should live under rust/") + .to_path_buf() +} + +fn production_rust_files(repo_root: &Path) -> Vec { + let rust_dir = repo_root.join("rust"); + let mut files = Vec::new(); + + for package in fs::read_dir(&rust_dir).expect("failed to read rust workspace directory") { + let package = package.expect("failed to read rust workspace entry"); + let package_name = package.file_name(); + let package_name = package_name.to_string_lossy(); + + if !package_name.starts_with("bioscript-") { + continue; + } + + let src_dir = package.path().join("src"); + if src_dir.is_dir() { + collect_rust_files(&src_dir, &mut files); + } + } + + files.sort(); + files +} + +fn documented_refactor_backlog(repo_root: &Path) -> BTreeSet { + let agents_path = repo_root.join("AGENTS.md"); + let agents = fs::read_to_string(&agents_path) + .unwrap_or_else(|err| panic!("failed to read {}: {err}", agents_path.display())); + let mut in_backlog = false; + let mut paths = BTreeSet::new(); + + for line in agents.lines() { + if line.starts_with("## ") { + in_backlog = line == "## Current Refactor Backlog"; + continue; + } + + if !in_backlog { + continue; + } + + if let Some(path) = markdown_code_span(line) { + paths.insert(path.to_owned()); + } + } + + paths +} + +fn markdown_code_span(line: &str) -> Option<&str> { + let start = line.find('`')? 
+ 1; + let end = line[start..].find('`')? + start; + Some(&line[start..end]) +} + +fn collect_rust_files(dir: &Path, files: &mut Vec) { + for entry in + fs::read_dir(dir).unwrap_or_else(|err| panic!("failed to read {}: {err}", dir.display())) + { + let entry = + entry.unwrap_or_else(|err| panic!("failed to read entry in {}: {err}", dir.display())); + let path = entry.path(); + + if path.is_dir() { + collect_rust_files(&path, files); + } else if path.extension().is_some_and(|ext| ext == "rs") { + files.push(path); + } + } +} + +fn relative_slash_path(repo_root: &Path, file: &Path) -> String { + file.strip_prefix(repo_root) + .expect("file should be inside repository") + .components() + .map(|component| component.as_os_str().to_string_lossy()) + .collect::>() + .join("/") +} + +fn production_line_count(source: &str) -> usize { + let mut count = 0; + let mut pending_cfg_test = false; + let mut skipped_brace_depth = None; + let mut brace_depth = 0usize; + + for line in source.lines() { + let trimmed = line.trim_start(); + + if let Some(target_depth) = skipped_brace_depth { + update_brace_depth(line, &mut brace_depth); + if brace_depth < target_depth { + skipped_brace_depth = None; + } + continue; + } + + if trimmed.starts_with("#[cfg(test)]") { + pending_cfg_test = true; + continue; + } + + let starts_test_module = pending_cfg_test && trimmed.starts_with("mod "); + pending_cfg_test = pending_cfg_test && trimmed.starts_with("#["); + + if starts_test_module { + update_brace_depth(line, &mut brace_depth); + skipped_brace_depth = Some(brace_depth); + continue; + } + + count += 1; + update_brace_depth(line, &mut brace_depth); + } + + count +} + +fn update_brace_depth(line: &str, brace_depth: &mut usize) { + for byte in line.bytes() { + match byte { + b'{' => *brace_depth += 1, + b'}' => *brace_depth = brace_depth.saturating_sub(1), + _ => {} + } + } +} diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index 0c97464..2039e3a 
100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -1,24 +1,25 @@ use std::{ - collections::{BTreeMap, HashSet}, - io::{BufRead, Read, Seek}, + io::{Read, Seek}, path::Path, }; -use noodles::{ - core::{Position, Region, region::Interval}, - cram::{self, crai, io::reader::Container}, - fasta::{self, repository::adapters::IndexedReader as FastaIndexedReader}, - sam::{ - self, - alignment::{Record as _, record::Cigar as _}, - }, - tabix, -}; +use noodles::cram; use bioscript_core::{GenomicLocus, RuntimeError}; use crate::genotype::GenotypeLoadOptions; +mod cram_stream; +mod readers; + +pub use readers::{ + build_cram_indexed_reader_from_reader, build_reference_repository_from_readers, + parse_crai_bytes, parse_fai_bytes, parse_tbi_bytes, +}; + +pub(crate) use cram_stream::for_each_raw_cram_record_with_reader_inner; +pub(crate) use readers::{build_cram_indexed_reader_from_path, build_reference_repository}; + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum AlignmentOpKind { Match, @@ -46,12 +47,6 @@ pub struct AlignmentRecord { pub cigar: Vec, } -#[derive(Debug, Clone)] -struct SelectedContainer { - offset: u64, - landmarks: HashSet, -} - pub(crate) fn for_each_cram_record( path: &Path, options: &GenotypeLoadOptions, @@ -65,7 +60,7 @@ where let repository = build_reference_repository(reference_file)?; let mut reader = build_cram_indexed_reader_from_path(path, options, repository)?; let label = path.display().to_string(); - for_each_cram_record_with_reader_inner( + cram_stream::for_each_cram_record_with_reader_inner( &mut reader, &label, locus, @@ -101,50 +96,7 @@ where R: Read + Seek, F: FnMut(AlignmentRecord) -> Result, { - for_each_cram_record_with_reader_inner(reader, label, locus, false, on_record) -} - -fn for_each_cram_record_with_reader_inner( - reader: &mut cram::io::indexed_reader::IndexedReader, - label: &str, - locus: &GenomicLocus, - allow_reference_md5_mismatch: bool, - mut on_record: F, -) -> 
Result<(), RuntimeError> -where - R: Read + Seek, - F: FnMut(AlignmentRecord) -> Result, -{ - // Same idempotent-rewind rationale as `for_each_raw_cram_record_with_reader` - // — the CRAM header lives at offset 0; we must rewind before each call so - // callers can iterate over multiple loci with the same `IndexedReader`. - reader - .get_mut() - .seek(std::io::SeekFrom::Start(0)) - .map_err(|err| RuntimeError::Io(format!("failed to rewind CRAM {label}: {err}")))?; - let header = reader - .read_header() - .map_err(|err| RuntimeError::Io(format!("failed to read CRAM header {label}: {err}")))?; - - let region = build_region(&header, locus).ok_or_else(|| { - RuntimeError::Unsupported(format!( - "indexed CRAM does not contain contig {} for {}:{}-{}", - locus.chrom, locus.chrom, locus.start, locus.end - )) - })?; - - let selected_containers = select_query_containers(reader.index(), &header, ®ion)?; - - stream_selected_alignment_records( - label, - reader, - &header, - ®ion, - locus.end, - &selected_containers, - allow_reference_md5_mismatch, - &mut on_record, - ) + cram_stream::for_each_cram_record_with_reader_inner(reader, label, locus, false, on_record) } /// Iterate raw CRAM records intersecting `locus`, streaming from an @@ -164,568 +116,9 @@ where for_each_raw_cram_record_with_reader_inner(reader, label, locus, false, on_record) } -pub(crate) fn for_each_raw_cram_record_with_reader_inner( - reader: &mut cram::io::indexed_reader::IndexedReader, - label: &str, - locus: &GenomicLocus, - allow_reference_md5_mismatch: bool, - mut on_record: F, -) -> Result<(), RuntimeError> -where - R: Read + Seek, - F: FnMut(cram::Record<'_>) -> Result, -{ - // Re-seeks to position 0 before reading the header so this helper is - // idempotent across repeated calls on the same indexed reader (e.g. a - // wasm caller looking up N variants in a loop). Otherwise the second - // call reads garbage because the stream position is wherever the - // previous container iteration left it. 
- reader - .get_mut() - .seek(std::io::SeekFrom::Start(0)) - .map_err(|err| RuntimeError::Io(format!("failed to rewind CRAM {label}: {err}")))?; - let header = reader - .read_header() - .map_err(|err| RuntimeError::Io(format!("failed to read CRAM header {label}: {err}")))?; - - let region = build_region(&header, locus).ok_or_else(|| { - RuntimeError::Unsupported(format!( - "indexed CRAM does not contain contig {} for {}:{}-{}", - locus.chrom, locus.chrom, locus.start, locus.end - )) - })?; - - let selected_containers = select_query_containers(reader.index(), &header, ®ion)?; - - stream_selected_cram_records( - label, - reader, - &header, - ®ion, - locus.end, - &selected_containers, - allow_reference_md5_mismatch, - &mut on_record, - ) -} - -/// Build a CRAM `IndexedReader` over any `Read + Seek` source given a parsed -/// CRAI index and a reference repository. Mirrors `build_from_path` but with -/// an externally-provided reader — the wasm path uses this with a JS-backed -/// reader; native paths still go through the path-based helper below. -pub fn build_cram_indexed_reader_from_reader( - reader: R, - crai_index: crai::Index, - repository: fasta::Repository, -) -> Result, RuntimeError> -where - R: Read, -{ - cram::io::indexed_reader::Builder::default() - .set_reference_sequence_repository(repository) - .set_index(crai_index) - .build_from_reader(reader) - .map_err(|err| RuntimeError::Io(format!("failed to build indexed CRAM reader: {err}"))) -} - -/// Build a FASTA `Repository` over any `BufRead + Seek + Send + Sync` source -/// given a parsed FAI index. The `Send + Sync + 'static` bounds come from -/// `fasta::Repository`'s internal `Arc>` -/// cache — on single-threaded wasm32 these can be met via `unsafe impl`. 
-pub fn build_reference_repository_from_readers( - reader: R, - fai_index: fasta::fai::Index, -) -> fasta::Repository -where - R: BufRead + Seek + Send + Sync + 'static, -{ - let indexed = fasta::io::IndexedReader::new(reader, fai_index); - fasta::Repository::new(FastaIndexedReader::new(indexed)) -} - -/// Parse a CRAM index (`.crai`) from an in-memory byte buffer. Used by wasm -/// callers that receive the small index inline while the big CRAM stays on a -/// JS-backed reader. -pub fn parse_crai_bytes(bytes: &[u8]) -> Result { - crai::io::Reader::new(std::io::Cursor::new(bytes)) - .read_index() - .map_err(|err| RuntimeError::Io(format!("failed to parse CRAM index bytes: {err}"))) -} - -/// Parse a FASTA index (`.fai`) from an in-memory byte buffer. -pub fn parse_fai_bytes(bytes: &[u8]) -> Result { - fasta::fai::io::Reader::new(std::io::Cursor::new(bytes)) - .read_index() - .map_err(|err| RuntimeError::Io(format!("failed to parse FASTA index bytes: {err}"))) -} - -/// Parse a tabix index (`.tbi`) from an in-memory byte buffer. Used by wasm -/// callers that pass the small index inline while the bgzipped VCF stays on -/// a JS-backed `Read + Seek` reader. 
-pub fn parse_tbi_bytes(bytes: &[u8]) -> Result { - tabix::io::Reader::new(std::io::Cursor::new(bytes)) - .read_index() - .map_err(|err| RuntimeError::Io(format!("failed to parse tabix index bytes: {err}"))) -} - -pub(crate) fn build_cram_indexed_reader_from_path( - path: &Path, - options: &GenotypeLoadOptions, - repository: fasta::Repository, -) -> Result, RuntimeError> { - let mut builder = - cram::io::indexed_reader::Builder::default().set_reference_sequence_repository(repository); - - if let Some(index_path) = options.input_index.as_ref() { - let index = crai::fs::read(index_path).map_err(|err| { - RuntimeError::Io(format!( - "failed to read CRAM index {} for {}: {err}", - index_path.display(), - path.display() - )) - })?; - builder = builder.set_index(index); - } - - builder.build_from_path(path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open indexed CRAM {}: {err}", - path.display() - )) - }) -} - -pub(crate) fn build_reference_repository( - reference_file: &Path, -) -> Result { - let reader = fasta::io::indexed_reader::Builder::default() - .build_from_path(reference_file) - .map_err(|err| { - RuntimeError::Io(format!( - "failed to open indexed FASTA {}: {err}", - reference_file.display() - )) - })?; - - Ok(fasta::Repository::new(FastaIndexedReader::new(reader))) -} - -fn build_region(header: &sam::Header, locus: &GenomicLocus) -> Option { - let chrom = resolve_reference_name(header, &locus.chrom)?; - let start = Position::try_from(usize::try_from(locus.start).ok()?).ok()?; - let end = Position::try_from(usize::try_from(locus.end).ok()?).ok()?; - let raw = format!("{chrom}:{start}-{end}"); - raw.parse().ok() -} - -fn select_query_containers( - index: &crai::Index, - header: &sam::Header, - region: &Region, -) -> Result, RuntimeError> { - let reference_sequence_id = - resolve_reference_sequence_id(header, region.name()).ok_or_else(|| { - RuntimeError::Unsupported(format!( - "indexed CRAM does not contain contig {}", - 
String::from_utf8_lossy(region.name()) - )) - })?; - - let interval = region.interval(); - let mut containers = BTreeMap::>::new(); - - for record in index { - if record.reference_sequence_id() != Some(reference_sequence_id) { - continue; - } - - if !record_intersects_interval(record, interval) { - continue; - } - - containers - .entry(record.offset()) - .or_default() - .insert(record.landmark()); - } - - Ok(containers - .into_iter() - .map(|(offset, landmarks)| SelectedContainer { offset, landmarks }) - .collect()) -} - -fn stream_selected_alignment_records( - label: &str, - reader: &mut cram::io::indexed_reader::IndexedReader, - header: &sam::Header, - region: &Region, - locus_end: i64, - selected_containers: &[SelectedContainer], - allow_reference_md5_mismatch: bool, - on_record: &mut F, -) -> Result<(), RuntimeError> -where - R: Read + Seek, - F: FnMut(AlignmentRecord) -> Result, -{ - stream_selected_cram_records( - label, - reader, - header, - region, - locus_end, - selected_containers, - allow_reference_md5_mismatch, - &mut |record| { - let alignment_record = build_alignment_record_from_cram(label, &record)?; - on_record(alignment_record) - }, - ) -} - -fn stream_selected_cram_records( - label: &str, - reader: &mut cram::io::indexed_reader::IndexedReader, - header: &sam::Header, - region: &Region, - locus_end: i64, - selected_containers: &[SelectedContainer], - allow_reference_md5_mismatch: bool, - on_record: &mut F, -) -> Result<(), RuntimeError> -where - R: Read + Seek, - F: FnMut(cram::Record<'_>) -> Result, -{ - let interval = region.interval(); - - for selected_container in selected_containers { - let offset = selected_container.offset; - reader - .get_mut() - .seek(std::io::SeekFrom::Start(offset)) - .map_err(|err| { - RuntimeError::Io(format!( - "failed to seek CRAM container at offset {offset} in {label}: {err}" - )) - })?; - - let mut container = Container::default(); - let len = reader.read_container(&mut container).map_err(|err| { - 
RuntimeError::Io(format!( - "failed to read CRAM container at offset {offset} in {label}: {err}" - )) - })?; - - if len == 0 { - break; - } - - let compression_header = container.compression_header().map_err(|err| { - RuntimeError::Io(format!( - "failed to decode CRAM compression header from {label}: {err}" - )) - })?; - - let landmarks = container.header().landmarks().to_vec(); - let reference_sequence_repository = reader.reference_sequence_repository().clone(); - - let mut stop = false; - - for (index, slice_result) in container.slices().enumerate() { - let slice = slice_result.map_err(|err| { - RuntimeError::Io(format!("failed to read CRAM slice from {label}: {err}")) - })?; - - let Some(&landmark_i32) = landmarks.get(index) else { - return Err(RuntimeError::Io(format!( - "missing CRAM slice landmark {index} in {label}" - ))); - }; - let Ok(landmark) = u64::try_from(landmark_i32) else { - continue; - }; - if !selected_container.landmarks.contains(&landmark) { - continue; - } - - let (core_data_src, external_data_srcs) = slice.decode_blocks().map_err(|err| { - RuntimeError::Io(format!( - "failed to decode CRAM slice blocks from {label}: {err}" - )) - })?; - - let mut callback_err: Option = None; - let decode_result = slice.records_while( - reference_sequence_repository.clone(), - header, - &compression_header, - &core_data_src, - &external_data_srcs, - true, - |record| { - Ok(handle_decoded_cram_record( - label, - record, - interval, - locus_end, - &mut stop, - &mut callback_err, - on_record, - )) - }, - ); - - match decode_result { - Ok(()) => {} - Err(err) if allow_reference_md5_mismatch && is_reference_md5_mismatch(&err) => { - eprintln!( - "[bioscript] warning: CRAM reference MD5 mismatch for {label} slice landmark {landmark} — \ - retrying without checksum validation. Results may be incorrect if the \ - supplied reference differs from the one used to encode this CRAM. 
\ - Details: {err}" - ); - callback_err = None; - stop = false; - slice - .records_while( - reference_sequence_repository.clone(), - header, - &compression_header, - &core_data_src, - &external_data_srcs, - false, - |record| { - Ok(handle_decoded_cram_record( - label, - record, - interval, - locus_end, - &mut stop, - &mut callback_err, - on_record, - )) - }, - ) - .map_err(|err| { - RuntimeError::Io(format!( - "failed to decode CRAM slice records from {label} (unchecked): {err}" - )) - })?; - } - Err(err) if is_reference_md5_mismatch(&err) => { - return Err(RuntimeError::Io(format!( - "CRAM reference MD5 mismatch for {label} slice landmark {landmark}; rerun with --allow-md5-mismatch only if this lenient decode is intentional. Details: {err}" - ))); - } - Err(err) => { - return Err(RuntimeError::Io(format!( - "failed to decode CRAM slice records from {label}: {err}" - ))); - } - } - - if let Some(err) = callback_err { - return Err(err); - } - - if stop { - break; - } - } - - if stop { - break; - } - } - - Ok(()) -} - -fn handle_decoded_cram_record( - label: &str, - record: &cram::Record<'_>, - interval: Interval, - locus_end: i64, - stop: &mut bool, - callback_err: &mut Option, - on_record: &mut F, -) -> bool -where - F: FnMut(cram::Record<'_>) -> Result, -{ - let alignment_record = match build_alignment_record_from_cram(label, record) { - Ok(record) => record, - Err(err) => { - *callback_err = Some(err); - return false; - } - }; - - if alignment_record.start > locus_end { - *stop = true; - return false; - } - - if !alignment_record_intersects_interval(&alignment_record, interval) { - return true; - } - - match on_record(record.clone()) { - Ok(true) => true, - Ok(false) => { - *stop = true; - false - } - Err(err) => { - *callback_err = Some(err); - false - } - } -} - -fn is_reference_md5_mismatch(err: &std::io::Error) -> bool { - err.to_string() - .contains("reference sequence checksum mismatch") -} - -fn build_alignment_record_from_cram( - label: &str, - record: 
&cram::Record<'_>, -) -> Result { - let flags = record.flags().map_err(|err| { - RuntimeError::Io(format!( - "failed to read CRAM record flags from {label}: {err}" - )) - })?; - let is_unmapped = flags.is_unmapped(); - - let start = match record.alignment_start() { - Some(Ok(pos)) => i64::try_from(usize::from(pos)).map_err(|_| { - RuntimeError::Unsupported(format!( - "record alignment start exceeds i64 range in {label}" - )) - })?, - Some(Err(err)) => { - return Err(RuntimeError::Io(format!( - "failed to read CRAM alignment_start from {label}: {err}" - ))); - } - None => 0, - }; - - let end = match record.alignment_end() { - Some(Ok(pos)) => i64::try_from(usize::from(pos)).map_err(|_| { - RuntimeError::Unsupported(format!("record alignment end exceeds i64 range in {label}")) - })?, - Some(Err(err)) => { - return Err(RuntimeError::Io(format!( - "failed to read CRAM alignment_end from {label}: {err}" - ))); - } - None => start, - }; - - let cigar = record - .cigar() - .iter() - .map(|result| { - result.map(map_op).map_err(|err| { - RuntimeError::Io(format!("failed to read record CIGAR from {label}: {err}")) - }) - }) - .collect::, _>>()?; - - Ok(AlignmentRecord { - start, - end, - is_unmapped, - cigar, - }) -} - -fn resolve_reference_sequence_id(header: &sam::Header, name: &[u8]) -> Option { - header - .reference_sequences() - .iter() - .position(|(candidate, _)| { - let candidate_name: &[u8] = candidate.as_ref(); - candidate_name == name - }) -} - -fn record_intersects_interval( - record: &crai::Record, - interval: noodles::core::region::Interval, -) -> bool { - let Some(start) = record.alignment_start() else { - return false; - }; - - if record.alignment_span() == 0 { - return false; - } - - let Some(end) = start.checked_add(record.alignment_span() - 1) else { - return false; - }; - - interval.intersects((start..=end).into()) -} - -fn alignment_record_intersects_interval( - record: &AlignmentRecord, - interval: noodles::core::region::Interval, -) -> bool { - let 
Ok(start) = usize::try_from(record.start).and_then(Position::try_from) else { - return false; - }; - let Ok(end) = usize::try_from(record.end).and_then(Position::try_from) else { - return false; - }; - - interval.intersects((start..=end).into()) -} - -fn resolve_reference_name(header: &sam::Header, chrom: &str) -> Option { - let candidates = [ - chrom.to_owned(), - format!("chr{chrom}"), - chrom.trim_start_matches("chr").to_owned(), - ]; - - candidates.into_iter().find(|candidate| { - header.reference_sequences().iter().any(|(name, _)| { - let name_bytes: &[u8] = name.as_ref(); - name_bytes == candidate.as_bytes() - }) - }) -} - -fn map_op(op: sam::alignment::record::cigar::Op) -> AlignmentOp { - use sam::alignment::record::cigar::op::Kind; - - let kind = match op.kind() { - Kind::Match => AlignmentOpKind::Match, - Kind::Insertion => AlignmentOpKind::Insertion, - Kind::Deletion => AlignmentOpKind::Deletion, - Kind::Skip => AlignmentOpKind::Skip, - Kind::SoftClip => AlignmentOpKind::SoftClip, - Kind::HardClip => AlignmentOpKind::HardClip, - Kind::Pad => AlignmentOpKind::Pad, - Kind::SequenceMatch => AlignmentOpKind::SequenceMatch, - Kind::SequenceMismatch => AlignmentOpKind::SequenceMismatch, - }; - - AlignmentOp { - kind, - len: op.len(), - } -} - #[cfg(test)] mod tests { + use super::cram_stream::*; use super::*; use std::{fs::File, num::NonZero, path::PathBuf}; @@ -734,6 +127,11 @@ mod tests { alignment::record::cigar::{Op, op::Kind}, header::record::value::{Map, map::ReferenceSequence}, }; + use noodles::{ + core::{Position, Region}, + cram::crai, + fasta, + }; fn locus(chrom: &str, start: i64, end: i64) -> GenomicLocus { GenomicLocus { diff --git a/rust/bioscript-formats/src/alignment/cram_stream.rs b/rust/bioscript-formats/src/alignment/cram_stream.rs new file mode 100644 index 0000000..0b1facf --- /dev/null +++ b/rust/bioscript-formats/src/alignment/cram_stream.rs @@ -0,0 +1,498 @@ +use std::{ + collections::{BTreeMap, HashSet}, + io::{Read, Seek}, +}; + +use 
noodles::{ + core::{Position, Region, region::Interval}, + cram::{self, crai, io::reader::Container}, + sam::{ + self, + alignment::{Record as _, record::Cigar as _}, + }, +}; + +use bioscript_core::{GenomicLocus, RuntimeError}; + +use super::{AlignmentOp, AlignmentOpKind, AlignmentRecord}; + +#[derive(Debug, Clone)] +pub(crate) struct SelectedContainer { + pub(crate) offset: u64, + pub(crate) landmarks: HashSet, +} + +pub(crate) fn for_each_cram_record_with_reader_inner( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + allow_reference_md5_mismatch: bool, + mut on_record: F, +) -> Result<(), RuntimeError> +where + R: Read + Seek, + F: FnMut(AlignmentRecord) -> Result, +{ + // Same idempotent-rewind rationale as `for_each_raw_cram_record_with_reader` + // — the CRAM header lives at offset 0; we must rewind before each call so + // callers can iterate over multiple loci with the same `IndexedReader`. + reader + .get_mut() + .seek(std::io::SeekFrom::Start(0)) + .map_err(|err| RuntimeError::Io(format!("failed to rewind CRAM {label}: {err}")))?; + let header = reader + .read_header() + .map_err(|err| RuntimeError::Io(format!("failed to read CRAM header {label}: {err}")))?; + + let region = build_region(&header, locus).ok_or_else(|| { + RuntimeError::Unsupported(format!( + "indexed CRAM does not contain contig {} for {}:{}-{}", + locus.chrom, locus.chrom, locus.start, locus.end + )) + })?; + + let selected_containers = select_query_containers(reader.index(), &header, ®ion)?; + + stream_selected_alignment_records( + label, + reader, + &header, + ®ion, + locus.end, + &selected_containers, + allow_reference_md5_mismatch, + &mut on_record, + ) +} + +pub(crate) fn for_each_raw_cram_record_with_reader_inner( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + allow_reference_md5_mismatch: bool, + mut on_record: F, +) -> Result<(), RuntimeError> +where + R: Read + Seek, + F: 
FnMut(cram::Record<'_>) -> Result, +{ + // Re-seeks to position 0 before reading the header so this helper is + // idempotent across repeated calls on the same indexed reader (e.g. a + // wasm caller looking up N variants in a loop). Otherwise the second + // call reads garbage because the stream position is wherever the + // previous container iteration left it. + reader + .get_mut() + .seek(std::io::SeekFrom::Start(0)) + .map_err(|err| RuntimeError::Io(format!("failed to rewind CRAM {label}: {err}")))?; + let header = reader + .read_header() + .map_err(|err| RuntimeError::Io(format!("failed to read CRAM header {label}: {err}")))?; + + let region = build_region(&header, locus).ok_or_else(|| { + RuntimeError::Unsupported(format!( + "indexed CRAM does not contain contig {} for {}:{}-{}", + locus.chrom, locus.chrom, locus.start, locus.end + )) + })?; + + let selected_containers = select_query_containers(reader.index(), &header, ®ion)?; + + stream_selected_cram_records( + label, + reader, + &header, + ®ion, + locus.end, + &selected_containers, + allow_reference_md5_mismatch, + &mut on_record, + ) +} + +pub(crate) fn build_region(header: &sam::Header, locus: &GenomicLocus) -> Option { + let chrom = resolve_reference_name(header, &locus.chrom)?; + let start = Position::try_from(usize::try_from(locus.start).ok()?).ok()?; + let end = Position::try_from(usize::try_from(locus.end).ok()?).ok()?; + let raw = format!("{chrom}:{start}-{end}"); + raw.parse().ok() +} + +pub(crate) fn select_query_containers( + index: &crai::Index, + header: &sam::Header, + region: &Region, +) -> Result, RuntimeError> { + let reference_sequence_id = + resolve_reference_sequence_id(header, region.name()).ok_or_else(|| { + RuntimeError::Unsupported(format!( + "indexed CRAM does not contain contig {}", + String::from_utf8_lossy(region.name()) + )) + })?; + + let interval = region.interval(); + let mut containers = BTreeMap::>::new(); + + for record in index { + if record.reference_sequence_id() != 
Some(reference_sequence_id) { + continue; + } + + if !record_intersects_interval(record, interval) { + continue; + } + + containers + .entry(record.offset()) + .or_default() + .insert(record.landmark()); + } + + Ok(containers + .into_iter() + .map(|(offset, landmarks)| SelectedContainer { offset, landmarks }) + .collect()) +} + +fn stream_selected_alignment_records( + label: &str, + reader: &mut cram::io::indexed_reader::IndexedReader, + header: &sam::Header, + region: &Region, + locus_end: i64, + selected_containers: &[SelectedContainer], + allow_reference_md5_mismatch: bool, + on_record: &mut F, +) -> Result<(), RuntimeError> +where + R: Read + Seek, + F: FnMut(AlignmentRecord) -> Result, +{ + stream_selected_cram_records( + label, + reader, + header, + region, + locus_end, + selected_containers, + allow_reference_md5_mismatch, + &mut |record| { + let alignment_record = build_alignment_record_from_cram(label, &record)?; + on_record(alignment_record) + }, + ) +} + +fn stream_selected_cram_records( + label: &str, + reader: &mut cram::io::indexed_reader::IndexedReader, + header: &sam::Header, + region: &Region, + locus_end: i64, + selected_containers: &[SelectedContainer], + allow_reference_md5_mismatch: bool, + on_record: &mut F, +) -> Result<(), RuntimeError> +where + R: Read + Seek, + F: FnMut(cram::Record<'_>) -> Result, +{ + let interval = region.interval(); + + for selected_container in selected_containers { + let offset = selected_container.offset; + reader + .get_mut() + .seek(std::io::SeekFrom::Start(offset)) + .map_err(|err| { + RuntimeError::Io(format!( + "failed to seek CRAM container at offset {offset} in {label}: {err}" + )) + })?; + + let mut container = Container::default(); + let len = reader.read_container(&mut container).map_err(|err| { + RuntimeError::Io(format!( + "failed to read CRAM container at offset {offset} in {label}: {err}" + )) + })?; + + if len == 0 { + break; + } + + let compression_header = 
container.compression_header().map_err(|err| { + RuntimeError::Io(format!( + "failed to decode CRAM compression header from {label}: {err}" + )) + })?; + + let landmarks = container.header().landmarks().to_vec(); + let reference_sequence_repository = reader.reference_sequence_repository().clone(); + + let mut stop = false; + + for (index, slice_result) in container.slices().enumerate() { + let slice = slice_result.map_err(|err| { + RuntimeError::Io(format!("failed to read CRAM slice from {label}: {err}")) + })?; + + let Some(&landmark_i32) = landmarks.get(index) else { + return Err(RuntimeError::Io(format!( + "missing CRAM slice landmark {index} in {label}" + ))); + }; + let Ok(landmark) = u64::try_from(landmark_i32) else { + continue; + }; + if !selected_container.landmarks.contains(&landmark) { + continue; + } + + let (core_data_src, external_data_srcs) = slice.decode_blocks().map_err(|err| { + RuntimeError::Io(format!( + "failed to decode CRAM slice blocks from {label}: {err}" + )) + })?; + + let records = slice.records( + reference_sequence_repository.clone(), + header, + &compression_header, + &core_data_src, + &external_data_srcs, + ); + + match records { + Ok(records) => { + let mut callback_err: Option = None; + for record in &records { + if !handle_decoded_cram_record( + label, + record, + interval, + locus_end, + &mut stop, + &mut callback_err, + on_record, + ) { + break; + } + } + if let Some(err) = callback_err { + return Err(err); + } + } + Err(err) if allow_reference_md5_mismatch && is_reference_md5_mismatch(&err) => { + eprintln!( + "[bioscript] warning: CRAM reference MD5 mismatch for {label} slice landmark {landmark} — \ + this noodles version cannot retry without checksum validation. \ + Details: {err}" + ); + } + Err(err) if is_reference_md5_mismatch(&err) => { + return Err(RuntimeError::Io(format!( + "CRAM reference MD5 mismatch for {label} slice landmark {landmark}; rerun with --allow-md5-mismatch only if this lenient decode is intentional. 
Details: {err}" + ))); + } + Err(err) => { + return Err(RuntimeError::Io(format!( + "failed to decode CRAM slice records from {label}: {err}" + ))); + } + } + + if stop { + break; + } + } + + if stop { + break; + } + } + + Ok(()) +} + +fn handle_decoded_cram_record( + label: &str, + record: &cram::Record<'_>, + interval: Interval, + locus_end: i64, + stop: &mut bool, + callback_err: &mut Option, + on_record: &mut F, +) -> bool +where + F: FnMut(cram::Record<'_>) -> Result, +{ + let alignment_record = match build_alignment_record_from_cram(label, record) { + Ok(record) => record, + Err(err) => { + *callback_err = Some(err); + return false; + } + }; + + if alignment_record.start > locus_end { + *stop = true; + return false; + } + + if !alignment_record_intersects_interval(&alignment_record, interval) { + return true; + } + + match on_record(record.clone()) { + Ok(true) => true, + Ok(false) => { + *stop = true; + false + } + Err(err) => { + *callback_err = Some(err); + false + } + } +} + +pub(crate) fn is_reference_md5_mismatch(err: &std::io::Error) -> bool { + err.to_string() + .contains("reference sequence checksum mismatch") +} + +fn build_alignment_record_from_cram( + label: &str, + record: &cram::Record<'_>, +) -> Result { + let flags = record.flags().map_err(|err| { + RuntimeError::Io(format!( + "failed to read CRAM record flags from {label}: {err}" + )) + })?; + let is_unmapped = flags.is_unmapped(); + + let start = match record.alignment_start() { + Some(Ok(pos)) => i64::try_from(usize::from(pos)).map_err(|_| { + RuntimeError::Unsupported(format!( + "record alignment start exceeds i64 range in {label}" + )) + })?, + Some(Err(err)) => { + return Err(RuntimeError::Io(format!( + "failed to read CRAM alignment_start from {label}: {err}" + ))); + } + None => 0, + }; + + let end = match record.alignment_end() { + Some(Ok(pos)) => i64::try_from(usize::from(pos)).map_err(|_| { + RuntimeError::Unsupported(format!("record alignment end exceeds i64 range in {label}")) + 
})?, + Some(Err(err)) => { + return Err(RuntimeError::Io(format!( + "failed to read CRAM alignment_end from {label}: {err}" + ))); + } + None => start, + }; + + let cigar = record + .cigar() + .iter() + .map(|result| { + result.map(map_op).map_err(|err| { + RuntimeError::Io(format!("failed to read record CIGAR from {label}: {err}")) + }) + }) + .collect::, _>>()?; + + Ok(AlignmentRecord { + start, + end, + is_unmapped, + cigar, + }) +} + +pub(crate) fn resolve_reference_sequence_id(header: &sam::Header, name: &[u8]) -> Option { + header + .reference_sequences() + .iter() + .position(|(candidate, _)| { + let candidate_name: &[u8] = candidate.as_ref(); + candidate_name == name + }) +} + +pub(crate) fn record_intersects_interval( + record: &crai::Record, + interval: noodles::core::region::Interval, +) -> bool { + let Some(start) = record.alignment_start() else { + return false; + }; + + if record.alignment_span() == 0 { + return false; + } + + let Some(end) = start.checked_add(record.alignment_span() - 1) else { + return false; + }; + + interval.intersects((start..=end).into()) +} + +pub(crate) fn alignment_record_intersects_interval( + record: &AlignmentRecord, + interval: noodles::core::region::Interval, +) -> bool { + let Ok(start) = usize::try_from(record.start).and_then(Position::try_from) else { + return false; + }; + let Ok(end) = usize::try_from(record.end).and_then(Position::try_from) else { + return false; + }; + + interval.intersects((start..=end).into()) +} + +pub(crate) fn resolve_reference_name(header: &sam::Header, chrom: &str) -> Option { + let candidates = [ + chrom.to_owned(), + format!("chr{chrom}"), + chrom.trim_start_matches("chr").to_owned(), + ]; + + candidates.into_iter().find(|candidate| { + header.reference_sequences().iter().any(|(name, _)| { + let name_bytes: &[u8] = name.as_ref(); + name_bytes == candidate.as_bytes() + }) + }) +} + +pub(crate) fn map_op(op: sam::alignment::record::cigar::Op) -> AlignmentOp { + use 
sam::alignment::record::cigar::op::Kind; + + let kind = match op.kind() { + Kind::Match => AlignmentOpKind::Match, + Kind::Insertion => AlignmentOpKind::Insertion, + Kind::Deletion => AlignmentOpKind::Deletion, + Kind::Skip => AlignmentOpKind::Skip, + Kind::SoftClip => AlignmentOpKind::SoftClip, + Kind::HardClip => AlignmentOpKind::HardClip, + Kind::Pad => AlignmentOpKind::Pad, + Kind::SequenceMatch => AlignmentOpKind::SequenceMatch, + Kind::SequenceMismatch => AlignmentOpKind::SequenceMismatch, + }; + + AlignmentOp { + kind, + len: op.len(), + } +} diff --git a/rust/bioscript-formats/src/alignment/readers.rs b/rust/bioscript-formats/src/alignment/readers.rs new file mode 100644 index 0000000..d90fe9c --- /dev/null +++ b/rust/bioscript-formats/src/alignment/readers.rs @@ -0,0 +1,115 @@ +use std::{ + io::{BufRead, Read, Seek}, + path::Path, +}; + +use noodles::{ + cram::{self, crai}, + fasta::{self, repository::adapters::IndexedReader as FastaIndexedReader}, + tabix, +}; + +use bioscript_core::RuntimeError; + +use crate::genotype::GenotypeLoadOptions; + +/// Build a CRAM `IndexedReader` over any `Read + Seek` source given a parsed +/// CRAI index and a reference repository. Mirrors `build_from_path` but with +/// an externally-provided reader — the wasm path uses this with a JS-backed +/// reader; native paths still go through the path-based helper below. +pub fn build_cram_indexed_reader_from_reader( + reader: R, + crai_index: crai::Index, + repository: fasta::Repository, +) -> Result, RuntimeError> +where + R: Read, +{ + cram::io::indexed_reader::Builder::default() + .set_reference_sequence_repository(repository) + .set_index(crai_index) + .build_from_reader(reader) + .map_err(|err| RuntimeError::Io(format!("failed to build indexed CRAM reader: {err}"))) +} + +/// Build a FASTA `Repository` over any `BufRead + Seek + Send + Sync` source +/// given a parsed FAI index. 
The `Send + Sync + 'static` bounds come from +/// `fasta::Repository`'s internal `Arc>` +/// cache — on single-threaded wasm32 these can be met via `unsafe impl`. +pub fn build_reference_repository_from_readers( + reader: R, + fai_index: fasta::fai::Index, +) -> fasta::Repository +where + R: BufRead + Seek + Send + Sync + 'static, +{ + let indexed = fasta::io::IndexedReader::new(reader, fai_index); + fasta::Repository::new(FastaIndexedReader::new(indexed)) +} + +/// Parse a CRAM index (`.crai`) from an in-memory byte buffer. Used by wasm +/// callers that receive the small index inline while the big CRAM stays on a +/// JS-backed reader. +pub fn parse_crai_bytes(bytes: &[u8]) -> Result { + crai::io::Reader::new(std::io::Cursor::new(bytes)) + .read_index() + .map_err(|err| RuntimeError::Io(format!("failed to parse CRAM index bytes: {err}"))) +} + +/// Parse a FASTA index (`.fai`) from an in-memory byte buffer. +pub fn parse_fai_bytes(bytes: &[u8]) -> Result { + fasta::fai::io::Reader::new(std::io::Cursor::new(bytes)) + .read_index() + .map_err(|err| RuntimeError::Io(format!("failed to parse FASTA index bytes: {err}"))) +} + +/// Parse a tabix index (`.tbi`) from an in-memory byte buffer. Used by wasm +/// callers that pass the small index inline while the bgzipped VCF stays on +/// a JS-backed `Read + Seek` reader. 
+pub fn parse_tbi_bytes(bytes: &[u8]) -> Result { + tabix::io::Reader::new(std::io::Cursor::new(bytes)) + .read_index() + .map_err(|err| RuntimeError::Io(format!("failed to parse tabix index bytes: {err}"))) +} + +pub(crate) fn build_cram_indexed_reader_from_path( + path: &Path, + options: &GenotypeLoadOptions, + repository: fasta::Repository, +) -> Result, RuntimeError> { + let mut builder = + cram::io::indexed_reader::Builder::default().set_reference_sequence_repository(repository); + + if let Some(index_path) = options.input_index.as_ref() { + let index = crai::fs::read(index_path).map_err(|err| { + RuntimeError::Io(format!( + "failed to read CRAM index {} for {}: {err}", + index_path.display(), + path.display() + )) + })?; + builder = builder.set_index(index); + } + + builder.build_from_path(path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open indexed CRAM {}: {err}", + path.display() + )) + }) +} + +pub(crate) fn build_reference_repository( + reference_file: &Path, +) -> Result { + let reader = fasta::io::indexed_reader::Builder::default() + .build_from_path(reference_file) + .map_err(|err| { + RuntimeError::Io(format!( + "failed to open indexed FASTA {}: {err}", + reference_file.display() + )) + })?; + + Ok(fasta::Repository::new(FastaIndexedReader::new(reader))) +} diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index b7173fd..4c98424 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -11,7 +11,10 @@ use noodles::bgzf; use noodles::core::{Position, Region}; use noodles::cram; use noodles::csi::{self, BinningIndex}; -use noodles::sam::alignment::Record as _; +use noodles::sam::alignment::{ + Record as _, + record::{Cigar as _, QualityScores as _, Sequence as _, cigar::op::Kind as CigarOpKind}, +}; use noodles::tabix; use zip::ZipArchive; @@ -1030,7 +1033,7 @@ fn snp_pileup_with_reader( } let Some((base, base_quality)) = - 
record.base_quality_at_reference_position(target_position, reference_base) + cram_base_quality_at_reference_position(&record, target_position, reference_base)? else { return Ok(true); }; @@ -1083,6 +1086,61 @@ fn snp_pileup_with_reader( Ok(counts) } +fn cram_base_quality_at_reference_position( + record: &cram::Record<'_>, + target_position: Position, + reference_base: u8, +) -> Result, RuntimeError> { + let Some(alignment_start) = record.alignment_start() else { + return Ok(None); + }; + let alignment_start = alignment_start + .map_err(|err| RuntimeError::Io(format!("failed to read CRAM alignment start: {err}")))?; + let mut reference_position = usize::from(alignment_start); + let target = usize::from(target_position); + let mut read_position = 0usize; + let sequence = record.sequence(); + let qualities = record.quality_scores(); + + for op in record.cigar().iter() { + let op = op.map_err(|err| RuntimeError::Io(format!("failed to read CRAM CIGAR: {err}")))?; + match op.kind() { + CigarOpKind::Match | CigarOpKind::SequenceMatch | CigarOpKind::SequenceMismatch => { + for offset in 0..op.len() { + if reference_position + offset == target { + let base = sequence + .get(read_position + offset) + .unwrap_or(reference_base); + let quality = qualities + .iter() + .nth(read_position + offset) + .transpose() + .map_err(|err| { + RuntimeError::Io(format!("failed to read CRAM base quality: {err}")) + })? 
+ .unwrap_or(0); + return Ok(Some((base, quality))); + } + } + reference_position += op.len(); + read_position += op.len(); + } + CigarOpKind::Insertion | CigarOpKind::SoftClip => { + read_position += op.len(); + } + CigarOpKind::Deletion | CigarOpKind::Skip => { + if target >= reference_position && target < reference_position + op.len() { + return Ok(None); + } + reference_position += op.len(); + } + CigarOpKind::HardClip | CigarOpKind::Pad => {} + } + } + + Ok(None) +} + /// Observe a SNP at `locus` over an already-built CRAM `IndexedReader` and /// reference repository (held by the reader). Mirrors the internal /// `CramBackend::observe_snp` but reader-based, so non-filesystem callers diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index be6c90f..104ea84 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -72,1474 +72,15 @@ fn zip_bytes(entry_name: &str, contents: &[u8]) -> Vec { writer.finish().unwrap().into_inner() } -#[test] -fn genotype_store_from_bytes_handles_genotype_text() { - let store = GenotypeStore::from_bytes( - "sample.txt", - b"\xef\xbb\xbfrsid\tchromosome\tposition\tgenotype\n\ - # skipped comment\n\ - rs73885319\t22\t36265860\tag\n\ - rs60910145\t22\t36265900\tN/A\n", - ) - .unwrap(); - - assert_eq!(store.backend_name(), "text"); - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); - assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("--")); -} - -#[test] -fn genotype_source_format_parses_supported_values_and_rejects_unknowns() { - assert_eq!( - "txt".parse::().unwrap(), - GenotypeSourceFormat::Text - ); - assert_eq!( - "GENOTYPE".parse::().unwrap(), - GenotypeSourceFormat::Text - ); - assert_eq!( - "zip".parse::().unwrap(), - GenotypeSourceFormat::Zip - ); - assert_eq!( - "vcf".parse::().unwrap(), - GenotypeSourceFormat::Vcf - ); - assert_eq!( - "cram".parse::().unwrap(), - GenotypeSourceFormat::Cram - 
); - - let err = "bam".parse::().unwrap_err(); - assert_eq!(err, "unsupported input format: bam"); -} - -#[test] -fn backend_capabilities_match_query_backend_type() { - let rsid_map = GenotypeStore::from_bytes( - "sample.txt", - b"rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", - ) - .unwrap(); - assert_eq!(rsid_map.backend_name(), "text"); - assert!(rsid_map.supports(QueryKind::GenotypeByRsid)); - assert!(!rsid_map.supports(QueryKind::GenotypeByLocus)); - - let dir = temp_dir("backend-capabilities"); - let text_path = dir.join("sample.txt"); - fs::write( - &text_path, - "rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", - ) - .unwrap(); - let delimited = GenotypeStore::from_file(&text_path).unwrap(); - assert_eq!(delimited.backend_name(), "text"); - assert!(delimited.supports(QueryKind::GenotypeByRsid)); - assert!(delimited.supports(QueryKind::GenotypeByLocus)); - - let vcf_path = dir.join("sample.vcf"); - fs::write( - &vcf_path, - "##fileformat=VCFv4.2\n\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ - 1\t10\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - let vcf = GenotypeStore::from_file(&vcf_path).unwrap(); - assert_eq!(vcf.backend_name(), "vcf"); - assert!(vcf.supports(QueryKind::GenotypeByRsid)); - assert!(vcf.supports(QueryKind::GenotypeByLocus)); - - let cram = GenotypeStore::from_file_with_options( - &dir.join("sample.dat"), - &GenotypeLoadOptions { - format: Some(GenotypeSourceFormat::Cram), - ..GenotypeLoadOptions::default() - }, - ) - .unwrap(); - assert_eq!(cram.backend_name(), "cram"); - assert!(!cram.supports(QueryKind::GenotypeByRsid)); - assert!(cram.supports(QueryKind::GenotypeByLocus)); -} - -#[test] -fn genotype_store_from_bytes_handles_vcf() { - let store = GenotypeStore::from_bytes( - "sample.vcf", - b"##fileformat=VCFv4.2\n\ - ##FORMAT=\n\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ - 22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - - 
assert_eq!(store.backend_name(), "vcf"); - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); -} - -#[test] -fn vcf_bytes_skip_unusable_rows_and_decode_no_call_forms() { - let store = GenotypeStore::from_bytes( - "sample.vcf", - b"##fileformat=VCFv4.2\n\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ - 1\t10\t.\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ - 1\t11\trsEmptyRef\t.\tG\t.\tPASS\t.\tGT\t0/1\n\ - 1\t12\trsEmptyAlt\tA\t.\t.\tPASS\t.\tGT\t0/1\n\ - 1\t13\trsShort\tA\tG\n\ - 1\t14\trsNoCall\tA\tG\t.\tPASS\t.\tGT\t.\n\ - 1\t15\trsPartialNoCall\tA\tG\t.\tPASS\t.\tGT\t./1\n\ - 1\t16\trsOutOfRange\tA\tG\t.\tPASS\t.\tGT\t0/2\n\ - 1\t17\trsValid\tC\tT\t.\tPASS\t.\tGT\t1|1\n", - ) - .unwrap(); - - assert_eq!(store.backend_name(), "vcf"); - assert_eq!(store.get("rsValid").unwrap().as_deref(), Some("TT")); - assert_eq!(store.get("rsNoCall").unwrap().as_deref(), Some("--")); - assert_eq!(store.get("rsPartialNoCall").unwrap().as_deref(), Some("--")); - assert_eq!(store.get("rsOutOfRange").unwrap(), None); - assert_eq!(store.get("rsEmptyRef").unwrap().as_deref(), Some(".G")); - assert_eq!(store.get("rsEmptyAlt").unwrap(), None); -} - -#[test] -fn extensionless_vcf_is_detected_by_content_and_can_be_forced() { - let dir = temp_dir("extensionless-vcf"); - let path = dir.join("sample.data"); - fs::write( - &path, - "##fileformat=VCFv4.2\n\ - ##reference=GRCh37\n\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ - 1\t10\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - - let detected = GenotypeStore::from_file(&path).unwrap(); - assert_eq!(detected.backend_name(), "vcf"); - assert_eq!(detected.get("rs1").unwrap().as_deref(), Some("AG")); - - let forced = GenotypeStore::from_file_with_options( - &path, - &GenotypeLoadOptions { - format: Some(GenotypeSourceFormat::Vcf), - ..GenotypeLoadOptions::default() - }, - ) - .unwrap(); - assert_eq!(forced.backend_name(), "vcf"); - assert_eq!(forced.get("rs1").unwrap().as_deref(), Some("AG")); 
-} - -#[test] -fn vcf_file_lookup_handles_gt_field_order_no_calls_and_bad_positions() { - let dir = temp_dir("vcf-field-order"); - let path = dir.join("sample.vcf"); - fs::write( - &path, - "##fileformat=VCFv4.2\n\ - ##reference=GRCh38\n\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ - 1\t10\trs1\tA\tG\t.\tPASS\t.\tDP:GT\t14:0|1\n\ - 1\t11\trs2\tC\tT\t.\tPASS\t.\tGT:DP\t./.:9\n", - ) - .unwrap(); - - let store = GenotypeStore::from_file(&path).unwrap(); - assert_eq!(store.get("rs1").unwrap().as_deref(), Some("AG")); - assert_eq!(store.get("rs2").unwrap().as_deref(), Some("--")); - - let bad_path = dir.join("bad.vcf"); - fs::write( - &bad_path, - "##fileformat=VCFv4.2\n\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ - 1\tnot-a-pos\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - let err = GenotypeStore::from_file(&bad_path) - .unwrap() - .get("rs1") - .unwrap_err(); - assert!( - format!("{err:?}").contains("failed to parse VCF position 'not-a-pos'"), - "{err:?}" - ); -} - -#[test] -fn genotype_store_from_bytes_handles_zip() { - let bytes = zip_bytes( - "nested/sample.txt", - b"rsid\tchromosome\tposition\tgenotype\nrs73885319\t22\t36265860\tAG\n", - ); - - let store = GenotypeStore::from_bytes("sample.zip", &bytes).unwrap(); - - assert_eq!(store.backend_name(), "zip"); - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); -} - -#[test] -fn rsid_map_batch_lookup_preserves_order_and_reports_missing_rsids() { - let store = GenotypeStore::from_bytes( - "sample.txt", - b"rsid\tchromosome\tposition\tgenotype\nrs2\t1\t20\tCT\nrs1\t1\t10\tAG\n", - ) - .unwrap(); - - let results = store - .lookup_variants(&[ - VariantSpec { - rsids: vec!["rs2".to_owned()], - ..VariantSpec::default() - }, - VariantSpec { - rsids: vec!["rsMissing".to_owned()], - ..VariantSpec::default() - }, - VariantSpec { - rsids: vec!["rs1".to_owned()], - ..VariantSpec::default() - }, - ]) - .unwrap(); - - 
assert_eq!(results[0].genotype.as_deref(), Some("CT")); - assert_eq!(results[1].genotype, None); - assert_eq!( - results[1].evidence, - vec!["no matching rsid found".to_owned()] - ); - assert_eq!(results[2].genotype.as_deref(), Some("AG")); -} - -#[test] -fn genotype_store_from_bytes_rejects_malformed_zip() { - let err = GenotypeStore::from_bytes("sample.zip", b"not a zip").unwrap_err(); - - assert!( - format!("{err:?}").contains("failed to read genotype zip sample.zip"), - "{err:?}" - ); -} - -#[test] -fn genotype_store_from_bytes_rejects_zip_without_supported_entry() { - let bytes = zip_bytes("notes.bin", b"not genotype data"); - - let err = GenotypeStore::from_bytes("sample.zip", &bytes).unwrap_err(); - - assert!( - format!("{err:?}") - .contains("zip archive sample.zip does not contain a supported genotype file"), - "{err:?}" - ); -} - -#[test] -fn alignment_index_parsers_handle_in_memory_bytes() { - let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); - let _repository = alignment::build_reference_repository_from_readers( - std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), - fai, - ); - - let err = alignment::parse_fai_bytes(b"not a fai").unwrap_err(); - assert!(format!("{err:?}").contains("failed to parse FASTA index bytes")); - - let err = alignment::parse_crai_bytes(b"not a crai").unwrap_err(); - assert!(format!("{err:?}").contains("failed to parse CRAM index bytes")); - - let err = alignment::parse_tbi_bytes(b"not a tbi").unwrap_err(); - assert!(format!("{err:?}").contains("failed to parse tabix index bytes")); -} - -#[test] -fn alignment_reader_api_reports_invalid_cram_headers_without_real_fixtures() { - let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); - let repository = alignment::build_reference_repository_from_readers( - std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), - fai, - ); - let locus = bioscript_core::GenomicLocus { - chrom: "chr1".to_owned(), - start: 1, - 
end: 1, - }; - let crai_bytes = fs::read(mini_fixtures_dir().join("mini.cram.crai")).unwrap(); - let mut reader = alignment::build_cram_indexed_reader_from_reader( - std::io::Cursor::new(b"not a cram".to_vec()), - alignment::parse_crai_bytes(&crai_bytes).unwrap(), - repository, - ) - .unwrap(); - - let err = - alignment::for_each_cram_record_with_reader(&mut reader, "bad.cram", &locus, |_| Ok(true)) - .unwrap_err(); - assert!( - format!("{err:?}").contains("failed to read CRAM header bad.cram"), - "{err:?}" - ); - - let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); - let repository = alignment::build_reference_repository_from_readers( - std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), - fai, - ); - let mut raw_reader = alignment::build_cram_indexed_reader_from_reader( - std::io::Cursor::new(b"still not a cram".to_vec()), - alignment::parse_crai_bytes(&crai_bytes).unwrap(), - repository, - ) - .unwrap(); - - let err = alignment::for_each_raw_cram_record_with_reader( - &mut raw_reader, - "raw-bad.cram", - &locus, - |_| Ok(true), - ) - .unwrap_err(); - assert!( - format!("{err:?}").contains("failed to read CRAM header raw-bad.cram"), - "{err:?}" - ); -} - -#[test] -fn delimited_parser_handles_comments_blank_lines_csv_and_split_alleles() { - let dir = temp_dir("csv-split-alleles"); - let path = dir.join("sample.csv"); - fs::write( - &path, - "\n\ - # rsid,chromosome,position,allele1,allele2\n\ - // ignored comment\n\ - rs73885319,chr22,36265860,a,g\n\ - rs60910145,22,36265900,n/a,\n", - ) - .unwrap(); - - let store = GenotypeStore::from_file(&path).unwrap(); - - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); - assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("--")); - - let observation = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "22".to_owned(), - start: 36_265_860, - end: 36_265_861, - }), - ..VariantSpec::default() - }) - .unwrap(); - 
assert_eq!(observation.genotype.as_deref(), Some("AG")); - assert_eq!( - observation.evidence, - vec!["resolved by locus chr22:36265860".to_owned()] - ); -} - -#[test] -fn delimited_parser_uses_comment_headers_aliases_quotes_and_extra_columns() { - let dir = temp_dir("comment-header-aliases"); - let path = dir.join("sample.csv"); - fs::write( - &path, - "# SNP ID, Chrom, Base Pair Position, Result, Ignored\n\ - \"rsQuoted\", \"chr3\", \"300\", \"a t\", \"unused, value\"\n\ - rsSlash,3,301,A/-,\n\ - rsNone,3,302,None,\n\ - no_position,3,,AG,\n", - ) - .unwrap(); - - let store = GenotypeStore::from_file(&path).unwrap(); - - assert_eq!(store.get("rsQuoted").unwrap().as_deref(), Some("AT")); - assert_eq!(store.get("rsSlash").unwrap().as_deref(), Some("ID")); - assert_eq!(store.get("rsNone").unwrap().as_deref(), Some("--")); - assert_eq!(store.get("no_position").unwrap().as_deref(), Some("AG")); - let observation = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "3".to_owned(), - start: 300, - end: 300, - }), - ..VariantSpec::default() - }) - .unwrap(); - assert_eq!(observation.genotype.as_deref(), Some("AT")); -} - -#[test] -fn delimited_parser_handles_space_delimited_rows_without_headers_and_inline_comments() { - let dir = temp_dir("space-default-header"); - let path = dir.join("sample.txt"); - fs::write( - &path, - "\n\ - rsSpace chr2 200 tc # inline comment\n\ - chrOnly chr2 201 aa\n\ - badrow\n", - ) - .unwrap(); - - let store = GenotypeStore::from_file(&path).unwrap(); - - assert_eq!(store.get("rsSpace").unwrap().as_deref(), Some("TC")); - let observation = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "2".to_owned(), - start: 201, - end: 201, - }), - ..VariantSpec::default() - }) - .unwrap(); - assert_eq!(observation.genotype.as_deref(), Some("AA")); - assert_eq!( - observation.evidence, - vec!["resolved by locus chr2:201".to_owned()] - ); -} - -#[test] -fn 
vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { - let dir = temp_dir("vcf-chr-normalize"); - let path = dir.join("sample.vcf"); - fs::write( - &path, - "##fileformat=VCFv4.2\n\ - ##reference=GRCh38\n\ - ##FORMAT=\n\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ - chr1\t1000\t.\tA\tC,G\t.\tPASS\t.\tGT\t2/1\n", - ) - .unwrap(); - - let store = GenotypeStore::from_file(&path).unwrap(); - let observation = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 1000, - end: 1001, - }), - reference: Some("A".to_owned()), - alternate: Some("G".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .unwrap(); - - assert_eq!(observation.genotype.as_deref(), Some("GC")); - assert_eq!(observation.assembly, Some(bioscript_core::Assembly::Grch38)); - assert_eq!( - observation.evidence, - vec!["resolved by locus chr1:1000".to_owned()] - ); -} - -#[test] -fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { - let dir = temp_dir("vcf-indel-locus"); - let path = dir.join("sample.hg19.vcf"); - fs::write( - &path, - "##fileformat=VCFv4.2\n\ - ##reference=hg19\n\ - ##FORMAT=\n\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ - 1\t99\t.\tAT\tA\t.\tPASS\t.\tGT\t0/1\n\ - chr1\t199\t.\tA\tATG\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - - let store = GenotypeStore::from_file(&path).unwrap(); - let deletion = store - .lookup_variant(&VariantSpec { - grch37: Some(bioscript_core::GenomicLocus { - chrom: "chr1".to_owned(), - start: 100, - end: 100, - }), - reference: Some("AT".to_owned()), - alternate: Some("A".to_owned()), - kind: Some(VariantKind::Deletion), - deletion_length: Some(1), - ..VariantSpec::default() - }) - .unwrap(); - assert_eq!(deletion.genotype.as_deref(), Some("ID")); - assert_eq!(deletion.assembly, Some(bioscript_core::Assembly::Grch37)); - assert_eq!(deletion.evidence, vec!["resolved by locus 
1:99".to_owned()]); - - let insertion = store - .lookup_variant(&VariantSpec { - grch37: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 200, - end: 200, - }), - reference: Some("A".to_owned()), - alternate: Some("ATG".to_owned()), - kind: Some(VariantKind::Insertion), - ..VariantSpec::default() - }) - .unwrap(); - assert_eq!(insertion.genotype.as_deref(), Some("DI")); - assert_eq!( - insertion.evidence, - vec!["resolved by locus chr1:199".to_owned()] - ); - - let unresolved = store - .lookup_variant(&VariantSpec { - grch37: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 300, - end: 300, - }), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .unwrap(); - assert_eq!(unresolved.genotype, None); - assert_eq!( - unresolved.evidence, - vec!["no matching rsid or locus found for variant_by_locus".to_owned()] - ); -} - -fn forced_cram_store(dir: &std::path::Path, reference_name: &str) -> GenotypeStore { - GenotypeStore::from_file_with_options( - &dir.join("missing.cram"), - &GenotypeLoadOptions { - format: Some(GenotypeSourceFormat::Cram), - reference_file: Some(dir.join(reference_name)), - reference_index: Some(dir.join(format!("{reference_name}.fai"))), - input_index: Some(dir.join("missing.cram.crai")), - allow_reference_md5_mismatch: false, - }, - ) - .unwrap() -} - -#[test] -fn forced_cram_backend_reports_reference_and_coordinate_errors_without_reading_cram() { - let dir = temp_dir("cram-reference-errors"); - let cram_path = dir.join("missing.cram"); - let store_without_reference = GenotypeStore::from_file_with_options( - &cram_path, - &GenotypeLoadOptions { - format: Some(GenotypeSourceFormat::Cram), - ..GenotypeLoadOptions::default() - }, - ) - .unwrap(); - let err = store_without_reference - .lookup_variant(&VariantSpec { - rsids: vec!["rs1".to_owned()], - ..VariantSpec::default() - }) - .unwrap_err(); - assert!( - format!("{err:?}").contains("without --reference-file"), - "{err:?}" - ); - - let store = 
forced_cram_store(&dir, "GRCh38.fa"); - let err = store - .lookup_variant(&VariantSpec { - rsids: vec!["rs1".to_owned()], - ..VariantSpec::default() - }) - .unwrap_err(); - assert!(format!("{err:?}").contains("needs GRCh37/GRCh38 coordinates")); - assert!(format!("{err:?}").contains("reference index")); - assert!(format!("{err:?}").contains("input index")); -} - -#[test] -fn forced_cram_backend_reports_snp_and_indel_argument_errors_without_reading_cram() { - let dir = temp_dir("cram-variant-argument-errors"); - let store = forced_cram_store(&dir, "GRCh38.fa"); - let err = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - kind: Some(VariantKind::Snp), - alternate: Some("G".to_owned()), - ..VariantSpec::default() - }) - .unwrap_err(); - assert!(format!("{err:?}").contains("SNP variant requires ref/reference")); - - let err = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - reference: Some("A".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .unwrap_err(); - assert!(format!("{err:?}").contains("SNP variant requires alt/alternate")); - - let err = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - kind: Some(VariantKind::Deletion), - ..VariantSpec::default() - }) - .unwrap_err(); - assert!(format!("{err:?}").contains("deletion variant requires deletion_length")); - - let err = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - kind: Some(VariantKind::Indel), - ..VariantSpec::default() - }) - .unwrap_err(); - assert!(format!("{err:?}").contains("indel variant requires ref/reference")); - - let err = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus 
{ - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - reference: Some("A".to_owned()), - kind: Some(VariantKind::Insertion), - ..VariantSpec::default() - }) - .unwrap_err(); - assert!(format!("{err:?}").contains("indel variant requires alt/alternate")); -} - -#[test] -fn forced_cram_backend_reports_file_and_assembly_errors_without_reading_cram() { - let dir = temp_dir("cram-file-assembly-errors"); - let store = forced_cram_store(&dir, "GRCh38.fa"); - let err = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - reference: Some("A".to_owned()), - alternate: Some("G".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .unwrap_err(); - assert!( - format!("{err:?}").contains("failed to open indexed FASTA"), - "{err:?}" - ); - - let err = store - .lookup_variant(&VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - reference: Some("A".to_owned()), - alternate: Some("G".to_owned()), - kind: Some(VariantKind::Other), - ..VariantSpec::default() - }) - .unwrap_err(); - assert!(format!("{err:?}").contains("does not yet support Other")); - - let hg19_store = forced_cram_store(&dir, "hg19.fa"); - let err = hg19_store - .lookup_variant(&VariantSpec { - grch37: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - kind: Some(VariantKind::Other), - ..VariantSpec::default() - }) - .unwrap_err(); - assert!(format!("{err:?}").contains("does not yet support Other")); -} - -#[test] -fn batch_lookup_preserves_input_order_after_coordinate_sorting() { - let dir = temp_dir("batch-order"); - let path = dir.join("sample.txt"); - fs::write( - &path, - "rsid\tchromosome\tposition\tgenotype\n\ - rs2\t1\t20\tCT\n\ - rs1\t1\t10\tAG\n", - ) - .unwrap(); - let store = GenotypeStore::from_file(&path).unwrap(); - - let results = store - .lookup_variants(&[ - VariantSpec { - 
grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 20, - end: 20, - }), - ..VariantSpec::default() - }, - VariantSpec { - grch38: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 10, - end: 10, - }), - ..VariantSpec::default() - }, - ]) - .unwrap(); - - assert_eq!(results[0].genotype.as_deref(), Some("CT")); - assert_eq!( - results[0].evidence, - vec!["resolved by locus 1:20".to_owned()] - ); - assert_eq!(results[1].genotype.as_deref(), Some("AG")); - assert_eq!( - results[1].evidence, - vec!["resolved by locus 1:10".to_owned()] - ); -} - -#[test] -fn zip_genotype_file_is_auto_detected_and_readable() { - let dir = temp_dir("zip-auto"); - let zip_path = dir.join("apol1-input.zip"); - - let file = fs::File::create(&zip_path).unwrap(); - let mut writer = zip::ZipWriter::new(file); - writer - .start_file("test_snps.txt", SimpleFileOptions::default()) - .unwrap(); - writer - .write_all( - b"rsid\tchromosome\tposition\tgenotype\nrs73885319\t22\t1\tAG\nrs60910145\t22\t2\tTG\nrs71785313\t22\t3\tII\n", - ) - .unwrap(); - writer.finish().unwrap(); - - let store = GenotypeStore::from_file(&zip_path).unwrap(); - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); - assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("TG")); - assert_eq!(store.get("rs71785313").unwrap().as_deref(), Some("II")); -} - -#[test] -fn zip_genotype_file_can_be_forced_by_format() { - let dir = temp_dir("zip-forced"); - let zip_path = dir.join("apol1-input.dat"); - - let file = fs::File::create(&zip_path).unwrap(); - let mut writer = zip::ZipWriter::new(file); - writer - .start_file("test_snps.txt", SimpleFileOptions::default()) - .unwrap(); - writer - .write_all(b"rsid\tchromosome\tposition\tgenotype\nrs73885319\t22\t1\tAG\n") - .unwrap(); - writer.finish().unwrap(); - - let store = GenotypeStore::from_file_with_options( - &zip_path, - &GenotypeLoadOptions { - format: Some(GenotypeSourceFormat::Zip), - 
..GenotypeLoadOptions::default() - }, - ) - .unwrap(); - - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); -} - -#[test] -fn zip_vcf_entry_is_auto_detected_and_readable() { - let dir = temp_dir("zip-vcf"); - let zip_path = dir.join("apol1-sample.zip"); - - let file = fs::File::create(&zip_path).unwrap(); - let mut writer = zip::ZipWriter::new(file); - writer - .start_file("nested/sample.vcf", SimpleFileOptions::default()) - .unwrap(); - writer - .write_all( - b"##fileformat=VCFv4.2\n\ -##FORMAT=\n\ -#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ -22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - writer.finish().unwrap(); - - let store = GenotypeStore::from_file(&zip_path).unwrap(); - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); -} - -#[test] -fn zip_vcf_gz_entry_is_selected_and_read_as_vcf() { - let dir = temp_dir("zip-vcf-gz-entry"); - let zip_path = dir.join("sample.zip"); - - let file = fs::File::create(&zip_path).unwrap(); - let mut writer = zip::ZipWriter::new(file); - writer - .add_directory("nested/", SimpleFileOptions::default()) - .unwrap(); - writer - .start_file("nested/sample.vcf.gz", SimpleFileOptions::default()) - .unwrap(); - writer - .write_all( - b"##fileformat=VCFv4.2\n\ -#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ -2\t22\trsZipVcfGz\tG\tA\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - writer.finish().unwrap(); - - let store = GenotypeStore::from_file(&zip_path).unwrap(); - assert_eq!(store.backend_name(), "vcf"); - assert_eq!(store.get("rsZipVcfGz").unwrap().as_deref(), Some("GA")); -} - -#[test] -fn shared_real_world_zipped_genotype_exports_are_readable() { - struct FixtureExpectation { - relative: &'static str, - rsid: &'static str, - genotype: &'static str, - } - - let fixtures = [ - FixtureExpectation { - relative: "23andme/v2/hu0199C8/23data20100526.txt.zip", - rsid: "rs3094315", - genotype: "AA", - }, - FixtureExpectation { - relative: 
"23andme/v3/huE4DAE4/huE4DAE4_20120522224129.txt.zip", - rsid: "rs3131972", - genotype: "GG", - }, - FixtureExpectation { - relative: "23andme/v4/huE18D82/genome__v4_Full_2016.txt.zip", - rsid: "rs3131972", - genotype: "AG", - }, - FixtureExpectation { - relative: "23andme/v5/hu50B3F5/genome_hu50B3F5_v5_Full.zip", - rsid: "rs116587930", - genotype: "GG", - }, - FixtureExpectation { - relative: "dynamicdna/100001-synthetic/100001_X_X_GSAv3-DTC_GRCh38-07-12-2025.txt.zip", - rsid: "rs116587930", - genotype: "GG", - }, - FixtureExpectation { - relative: "ancestrydna/huE922FC/AncestryDNA.txt.zip", - rsid: "rs3131972", - genotype: "GG", - }, - FixtureExpectation { - relative: "familytreedna/hu17B792/2017-04-29_Family_Tree_DNA_Data.csv.zip", - rsid: "rs1000530", - genotype: "TT", - }, - FixtureExpectation { - relative: "genesforgood/hu80B047/GFG0_filtered_imputed_genotypes_noY_noMT_23andMe.txt.zip", - rsid: "rs3094315", - genotype: "AA", - }, - FixtureExpectation { - relative: "myheritage/hu33515F/MyHeritage_raw_dna_data.zip", - rsid: "rs3131972", - genotype: "GG", - }, - ]; - - for fixture in fixtures { - let Some(path) = shared_fixture_or_skip( - "shared_real_world_zipped_genotype_exports_are_readable", - fixture.relative, - ) else { - return; - }; - - let store = GenotypeStore::from_file(&path).unwrap(); - assert_eq!( - store.get(fixture.rsid).unwrap().as_deref(), - Some(fixture.genotype), - "fixture {}", - fixture.relative - ); - } -} - -#[test] -fn bundled_dynamicdna_gsav3_plain_text_fixture_is_readable() { - let path = repo_root() - .join("old/examples/apol1/genotype_files/108179_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt"); - let store = GenotypeStore::from_file(&path).unwrap(); - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AA")); -} - -#[test] -fn vcf_variant_lookup_reads_single_sample_calls() { - let dir = temp_dir("vcf"); - let vcf_path = dir.join("apol1_sample.vcf"); - fs::write( - &vcf_path, - "##fileformat=VCFv4.2\n\ -##FORMAT=\n\ 
-#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ -22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ -22\t36265900\trs60910145\tT\tG\t.\tPASS\t.\tGT\t1/1\n\ -22\t36266005\trs71785313\tA\tATTTAA\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - - let store = GenotypeStore::from_file(&vcf_path).unwrap(); - assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); - assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("GG")); - - let observation = store - .lookup_variant(&VariantSpec { - rsids: vec!["rs71785313".to_owned()], - kind: Some(VariantKind::Insertion), - ..VariantSpec::default() - }) - .unwrap(); - assert_eq!(observation.genotype.as_deref(), Some("DI")); -} - -#[test] -fn vcf_variant_lookup_ignores_symbolic_non_ref_alt_when_decoding_gt() { - let dir = temp_dir("vcf-non-ref"); - let vcf_path = dir.join("sample.g.vcf"); - fs::write( - &vcf_path, - "##fileformat=VCFv4.2\n\ -##FORMAT=\n\ -#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ -6\t39016636\trs10305420\tC\tT,\t.\tPASS\t.\tGT\t0/1\n\ -6\t38979128\trs9357296\tA\tG,\t.\tPASS\t.\tGT\t0/1\n", - ) - .unwrap(); - - let store = GenotypeStore::from_file(&vcf_path).unwrap(); - assert_eq!(store.get("rs10305420").unwrap().as_deref(), Some("CT")); - assert_eq!(store.get("rs9357296").unwrap().as_deref(), Some("AG")); -} - -#[test] -fn real_world_clean_vcf_supports_locus_lookup_without_rsids() { - let test_name = "real_world_clean_vcf_supports_locus_lookup_without_rsids"; - let Some(clean_vcf) = shared_fixture_or_skip(test_name, "1k-genomes/vcf/NA06985.clean.vcf.gz") - else { - return; - }; - let Some(original_vcf) = shared_fixture_or_skip(test_name, "1k-genomes/vcf/NA06985.vcf.gz") - else { - return; - }; - - let clean_store = GenotypeStore::from_file(&clean_vcf).expect("open cleaned VCF"); - let original_store = GenotypeStore::from_file(&original_vcf).expect("open original VCF"); - - let queries = [ - ( - VariantSpec { - grch37: Some(bioscript_core::GenomicLocus { - 
chrom: "1".to_owned(), - start: 12_783, - end: 12_783, - }), - reference: Some("G".to_owned()), - alternate: Some("A".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }, - "GA", - ), - ( - VariantSpec { - grch37: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 13_110, - end: 13_110, - }), - reference: Some("G".to_owned()), - alternate: Some("A".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }, - "GA", - ), - ( - VariantSpec { - rsids: vec!["rs78601809".to_owned()], - grch37: Some(bioscript_core::GenomicLocus { - chrom: "1".to_owned(), - start: 15_211, - end: 15_211, - }), - reference: Some("T".to_owned()), - alternate: Some("G".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }, - "GG", - ), - ]; - - for (query, expected_genotype) in queries { - let clean = clean_store.lookup_variant(&query).expect("clean lookup"); - let original = original_store - .lookup_variant(&query) - .expect("original lookup"); - - assert_eq!(clean.backend, "vcf"); - assert_eq!(clean.genotype.as_deref(), Some(expected_genotype)); - assert_eq!(original.genotype, clean.genotype); - } -} - -struct CramFixture { - cram: PathBuf, - reference: PathBuf, - reference_index: PathBuf, - input_index: PathBuf, -} - -fn run_large_cram_tests() -> bool { - env::var_os("BIOSCRIPT_RUN_LARGE_TESTS").is_some() -} - -fn require_large_cram_tests(test_name: &str) -> bool { - if run_large_cram_tests() { - true - } else { - eprintln!("skipping {test_name}: set BIOSCRIPT_RUN_LARGE_TESTS=1 to enable"); - false - } -} - -fn cram_fixture_or_skip(test_name: &str) -> Option { - if !require_large_cram_tests(test_name) { - return None; - } - let root = shared_test_data_root()?; - let fx = CramFixture { - cram: root.join("1k-genomes/aligned/NA06985.final.cram"), - reference: root.join("1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"), - reference_index: 
root.join("1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai"), - input_index: root.join("1k-genomes/aligned/NA06985.final.cram.crai"), - }; - for p in [ - &fx.cram, - &fx.reference, - &fx.reference_index, - &fx.input_index, - ] { - if !p.exists() { - eprintln!("skipping {test_name}: missing {}", p.display()); - return None; - } - } - Some(fx) -} - -fn chr_y_cram_fixture_or_skip(test_name: &str) -> Option { - let root = shared_test_data_root()?; - let fx = CramFixture { - cram: root.join("NA06985-chrY/aligned/NA06985.final.chrY.cram"), - reference: root.join("NA06985-chrY/ref/GRCh38_chrY.fa"), - reference_index: root.join("NA06985-chrY/ref/GRCh38_chrY.fa.fai"), - input_index: root.join("NA06985-chrY/aligned/NA06985.final.chrY.cram.crai"), - }; - for p in [ - &fx.cram, - &fx.reference, - &fx.reference_index, - &fx.input_index, - ] { - if !p.exists() { - eprintln!("skipping {test_name}: missing {}", p.display()); - return None; - } - } - Some(fx) -} - -fn open_cram_store(fx: &CramFixture) -> GenotypeStore { - open_cram_store_with_md5_policy(fx, false) -} - -fn open_cram_store_with_md5_policy( - fx: &CramFixture, - allow_reference_md5_mismatch: bool, -) -> GenotypeStore { - GenotypeStore::from_file_with_options( - &fx.cram, - &GenotypeLoadOptions { - format: Some(GenotypeSourceFormat::Cram), - input_index: Some(fx.input_index.clone()), - reference_file: Some(fx.reference.clone()), - reference_index: Some(fx.reference_index.clone()), - allow_reference_md5_mismatch, - }, - ) - .expect("open cram store") -} - -fn mini_fixtures_dir() -> PathBuf { - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures") -} - -fn mini_cram_fixture() -> CramFixture { - let dir = mini_fixtures_dir(); - CramFixture { - cram: dir.join("mini.cram"), - reference: dir.join("mini.fa"), - reference_index: dir.join("mini.fa.fai"), - input_index: dir.join("mini.cram.crai"), - } -} - -fn mini_cram_fixture_with_bad_ref() -> CramFixture { - let dir = mini_fixtures_dir(); - 
CramFixture { - cram: dir.join("mini.cram"), - reference: dir.join("mini_bad_ref.fa"), - reference_index: dir.join("mini_bad_ref.fa.fai"), - input_index: dir.join("mini.cram.crai"), - } -} - -#[test] -fn cram_mini_fixture_streams_only_locus_overlapping_reads() { - // mini.cram has 2000 reads covering chr_test:500..2499. The streaming path - // should decode roughly until it passes the locus and stop — correctness is - // asserted via depth (exactly 50 reads overlap a single base in the middle). - // If the streaming + early-termination path breaks and falls back to full - // slice decode, the wall time still finishes fine on 2000 reads but this - // test also catches regressions that double-count or miss reads. - let fx = mini_cram_fixture(); - let store = open_cram_store(&fx); - - let start = std::time::Instant::now(); - let observation = store - .lookup_variant(&VariantSpec { - rsids: vec!["mini_locus_1000".to_owned()], - grch38: Some(bioscript_core::GenomicLocus { - chrom: "chr_test".to_owned(), - start: 1000, - end: 1000, - }), - reference: Some("A".to_owned()), - alternate: Some("C".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .expect("mini cram lookup"); - let elapsed = start.elapsed(); - - assert_eq!(observation.backend, "cram"); - assert_eq!( - observation.depth.unwrap_or(0), - 50, - "expected exactly 50 reads overlapping chr_test:1000, got {:?}", - observation.depth - ); - // All reads match reference in the fixture so alt_count should be zero. - assert_eq!(observation.ref_count.unwrap_or(0), 50); - assert_eq!(observation.alt_count.unwrap_or(0), 0); - assert!( - elapsed.as_millis() < 2000, - "mini CRAM lookup took {elapsed:?}, expected well under 2s" - ); -} - -#[test] -fn cram_mini_fixture_md5_mismatch_is_tolerated_when_allowed() { - // mini_bad_ref.fa has a single-base mutation at chr_test:2800, inside the - // slice span but far from our query locus at 1000. 
noodles' strict MD5 - // check will fail; bioscript must warn + retry unchecked + still return - // the correct genotype (the bases at pos 1000 are identical in both refs). - let fx = mini_cram_fixture_with_bad_ref(); - let store = open_cram_store_with_md5_policy(&fx, true); - - let observation = store - .lookup_variant(&VariantSpec { - rsids: vec!["mini_locus_1000".to_owned()], - grch38: Some(bioscript_core::GenomicLocus { - chrom: "chr_test".to_owned(), - start: 1000, - end: 1000, - }), - reference: Some("A".to_owned()), - alternate: Some("C".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .expect("mini cram lookup should succeed via md5 fallback"); - - assert_eq!(observation.backend, "cram"); - assert_eq!( - observation.depth.unwrap_or(0), - 50, - "expected exactly 50 reads after md5 fallback, got {:?}", - observation.depth - ); - // Bases at the query locus are the same in both references, so the - // fallback-decoded reads should still be ref-homozygous. 
- assert_eq!(observation.ref_count.unwrap_or(0), 50); - assert_eq!(observation.alt_count.unwrap_or(0), 0); -} - -#[test] -fn cram_chr_y_fixture_lookup_is_fast_and_correct() { - let Some(fx) = chr_y_cram_fixture_or_skip("cram_chr_y_fixture_lookup_is_fast_and_correct") - else { - return; - }; - let store = open_cram_store(&fx); - - let start = std::time::Instant::now(); - let observation = store - .lookup_variant(&VariantSpec { - rsids: vec!["chrY_smoke_3449570".to_owned()], - grch38: Some(bioscript_core::GenomicLocus { - chrom: "chrY".to_owned(), - start: 3_449_570, - end: 3_449_570, - }), - reference: Some("A".to_owned()), - alternate: Some("G".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .expect("chrY lookup"); - let elapsed = start.elapsed(); - - assert_eq!(observation.backend, "cram"); - let depth = observation.depth.unwrap_or(0); - assert!( - depth >= 8, - "expected >=8 reads at chrY smoke locus, got {depth}" - ); - assert_eq!(observation.alt_count.unwrap_or(0), 0); - assert!( - observation.ref_count.unwrap_or(0) >= 8, - "expected ref-supporting reads at chrY smoke locus, got {:?}", - observation.ref_count - ); - assert!( - elapsed.as_secs() < 5, - "chrY CRAM lookup took {elapsed:?}, expected <5s" - ); -} - -#[test] -fn cram_apol1_snp_lookup_is_fast_and_correct() { - let Some(fx) = cram_fixture_or_skip("cram_apol1_snp_lookup_is_fast_and_correct") else { - return; - }; - let store = open_cram_store(&fx); - - let start = std::time::Instant::now(); - let observation = store - .lookup_variant(&VariantSpec { - rsids: vec!["rs73885319".to_owned()], - grch38: Some(bioscript_core::GenomicLocus { - chrom: "22".to_owned(), - start: 36_265_860, - end: 36_265_860, - }), - reference: Some("A".to_owned()), - alternate: Some("G".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .expect("apol1 lookup"); - let elapsed = start.elapsed(); - - assert_eq!(observation.backend, "cram"); - // NA06985 is 
reference-homozygous at APOL1 G1 site 1 per samtools mpileup. - let depth = observation.depth.unwrap_or(0); - assert!( - depth >= 10, - "expected >=10 reads at APOL1 locus, got {depth}" - ); - let ref_count = observation.ref_count.unwrap_or(0); - let alt_count = observation.alt_count.unwrap_or(0); - assert!( - ref_count > alt_count, - "NA06985 APOL1 G1 site 1 should be ref-dominant: ref={ref_count} alt={alt_count}" - ); - - // Slice-level CRAM decode is the hot path. Samtools does the same locus - // in ~40ms; we allow a generous ceiling to catch regressions (e.g. if the - // streaming/early-termination path breaks and we fall back to decoding - // every record in the slice, this blows past 10s). - assert!( - elapsed.as_secs() < 5, - "APOL1 CRAM lookup took {elapsed:?}, expected <5s (samtools does it in ~40ms)" - ); -} - -#[test] -fn cram_md5_mismatch_is_tolerated_and_returns_correct_result() { - // For NA06985.final.cram, the bundled GRCh38 FASTA's chr6 MD5 does not - // match the @SQ M5 the CRAM was encoded against (only chr22 matches). - // We must warn + fall back to unchecked decoding, and still return the - // correct genotype at the GLP1 rs10305420 locus. The correct call per - // samtools mpileup is reference-homozygous (CC). 
- let Some(fx) = - cram_fixture_or_skip("cram_md5_mismatch_is_tolerated_and_returns_correct_result") - else { - return; - }; - let store = open_cram_store_with_md5_policy(&fx, true); - - let observation = store - .lookup_variant(&VariantSpec { - rsids: vec!["rs10305420".to_owned()], - grch38: Some(bioscript_core::GenomicLocus { - chrom: "6".to_owned(), - start: 39_048_860, - end: 39_048_860, - }), - reference: Some("C".to_owned()), - alternate: Some("T".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .expect("glp1 lookup should succeed via md5 fallback"); - - assert_eq!(observation.backend, "cram"); - let depth = observation.depth.unwrap_or(0); - assert!( - depth >= 10, - "expected >=10 reads at GLP1 locus after md5 fallback, got {depth}" - ); - let genotype = observation - .genotype - .as_deref() - .expect("expected a genotype call"); - assert!( - genotype.chars().all(|c| c == 'C' || c == 'T'), - "unexpected genotype after md5 fallback: {genotype}" - ); - - // Parity with `samtools mpileup -f -r chr6:39048860-39048860`: - // that locus shows a mixed pileup (roughly half reference C, half T). - // We assert total depth matches samtools' reported depth within a small - // tolerance — confirms we are not silently dropping or duplicating reads - // after the unchecked-reference fallback. 
- let depth_i32 = i32::try_from(depth).unwrap_or(i32::MAX); - let samtools_depth: i32 = 41; - assert!( - (depth_i32 - samtools_depth).abs() <= 6, - "depth {depth_i32} differs from samtools mpileup depth {samtools_depth} by >6" - ); -} - -#[test] -fn cram_rs9357296_reports_heterozygous_counts_for_na06985() { - let Some(fx) = cram_fixture_or_skip("cram_rs9357296_reports_heterozygous_counts_for_na06985") - else { - return; - }; - let store = open_cram_store_with_md5_policy(&fx, true); - - let observation = store - .lookup_variant(&VariantSpec { - rsids: vec!["rs9357296".to_owned()], - grch38: Some(bioscript_core::GenomicLocus { - chrom: "6".to_owned(), - start: 39_011_352, - end: 39_011_352, - }), - reference: Some("A".to_owned()), - alternate: Some("G".to_owned()), - kind: Some(VariantKind::Snp), - ..VariantSpec::default() - }) - .expect("rs9357296 lookup"); - - assert_eq!(observation.backend, "cram"); - assert_eq!(observation.genotype.as_deref(), Some("AG")); - assert_eq!(observation.raw_counts.get("A").copied(), Some(18)); - assert_eq!(observation.raw_counts.get("G").copied(), Some(12)); - assert_eq!(observation.raw_counts.get("T").copied(), None); - assert_eq!(observation.depth, Some(29)); - assert_eq!(observation.ref_count, Some(17)); - assert_eq!(observation.alt_count, Some(12)); - assert!( - observation - .decision - .as_deref() - .is_some_and(|text| text.contains("alt_fraction=0.414")), - "missing SNP decision summary: {:?}", - observation.decision - ); - assert!( - observation - .evidence - .iter() - .any(|line| line.contains("raw pileup depth=30")), - "missing raw pileup evidence: {:?}", - observation.evidence - ); - assert!( - observation.evidence.iter().any(|line| { - line.contains("filtered_duplicate=4") - && line.contains("filtered_low_base_quality=1") - && line.contains("filtered_improper_pair=0") - }), - "missing mpileup-style filter evidence: {:?}", - observation.evidence - ); -} +#[path = "file_formats/alignment.rs"] +mod alignment_tests; +#[path = 
"file_formats/basic.rs"] +mod basic; +#[path = "file_formats/cram.rs"] +mod cram; +#[path = "file_formats/delimited.rs"] +mod delimited; +#[path = "file_formats/vcf.rs"] +mod vcf; +#[path = "file_formats/zip_and_fixtures.rs"] +mod zip_and_fixtures; diff --git a/rust/bioscript-formats/tests/file_formats/alignment.rs b/rust/bioscript-formats/tests/file_formats/alignment.rs new file mode 100644 index 0000000..2e00777 --- /dev/null +++ b/rust/bioscript-formats/tests/file_formats/alignment.rs @@ -0,0 +1,76 @@ +use super::*; + +fn mini_fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures") +} + +#[test] +fn alignment_index_parsers_handle_in_memory_bytes() { + let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); + let _repository = alignment::build_reference_repository_from_readers( + std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), + fai, + ); + + let err = alignment::parse_fai_bytes(b"not a fai").unwrap_err(); + assert!(format!("{err:?}").contains("failed to parse FASTA index bytes")); + + let err = alignment::parse_crai_bytes(b"not a crai").unwrap_err(); + assert!(format!("{err:?}").contains("failed to parse CRAM index bytes")); + + let err = alignment::parse_tbi_bytes(b"not a tbi").unwrap_err(); + assert!(format!("{err:?}").contains("failed to parse tabix index bytes")); +} + +#[test] +fn alignment_reader_api_reports_invalid_cram_headers_without_real_fixtures() { + let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); + let repository = alignment::build_reference_repository_from_readers( + std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), + fai, + ); + let locus = bioscript_core::GenomicLocus { + chrom: "chr1".to_owned(), + start: 1, + end: 1, + }; + let crai_bytes = fs::read(mini_fixtures_dir().join("mini.cram.crai")).unwrap(); + let mut reader = alignment::build_cram_indexed_reader_from_reader( + std::io::Cursor::new(b"not a cram".to_vec()), + 
alignment::parse_crai_bytes(&crai_bytes).unwrap(), + repository, + ) + .unwrap(); + + let err = + alignment::for_each_cram_record_with_reader(&mut reader, "bad.cram", &locus, |_| Ok(true)) + .unwrap_err(); + assert!( + format!("{err:?}").contains("failed to read CRAM header bad.cram"), + "{err:?}" + ); + + let fai = alignment::parse_fai_bytes(b"chr1\t4\t6\t4\t5\n").unwrap(); + let repository = alignment::build_reference_repository_from_readers( + std::io::BufReader::new(std::io::Cursor::new(b">chr1\nACGT\n".to_vec())), + fai, + ); + let mut raw_reader = alignment::build_cram_indexed_reader_from_reader( + std::io::Cursor::new(b"still not a cram".to_vec()), + alignment::parse_crai_bytes(&crai_bytes).unwrap(), + repository, + ) + .unwrap(); + + let err = alignment::for_each_raw_cram_record_with_reader( + &mut raw_reader, + "raw-bad.cram", + &locus, + |_| Ok(true), + ) + .unwrap_err(); + assert!( + format!("{err:?}").contains("failed to read CRAM header raw-bad.cram"), + "{err:?}" + ); +} diff --git a/rust/bioscript-formats/tests/file_formats/basic.rs b/rust/bioscript-formats/tests/file_formats/basic.rs new file mode 100644 index 0000000..8dca8f8 --- /dev/null +++ b/rust/bioscript-formats/tests/file_formats/basic.rs @@ -0,0 +1,269 @@ +use super::*; + +#[test] +fn genotype_store_from_bytes_handles_genotype_text() { + let store = GenotypeStore::from_bytes( + "sample.txt", + b"\xef\xbb\xbfrsid\tchromosome\tposition\tgenotype\n\ + # skipped comment\n\ + rs73885319\t22\t36265860\tag\n\ + rs60910145\t22\t36265900\tN/A\n", + ) + .unwrap(); + + assert_eq!(store.backend_name(), "text"); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); + assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("--")); +} + +#[test] +fn genotype_source_format_parses_supported_values_and_rejects_unknowns() { + assert_eq!( + "txt".parse::().unwrap(), + GenotypeSourceFormat::Text + ); + assert_eq!( + "GENOTYPE".parse::().unwrap(), + GenotypeSourceFormat::Text + ); + 
assert_eq!( + "zip".parse::().unwrap(), + GenotypeSourceFormat::Zip + ); + assert_eq!( + "vcf".parse::().unwrap(), + GenotypeSourceFormat::Vcf + ); + assert_eq!( + "cram".parse::().unwrap(), + GenotypeSourceFormat::Cram + ); + + let err = "bam".parse::().unwrap_err(); + assert_eq!(err, "unsupported input format: bam"); +} + +#[test] +fn backend_capabilities_match_query_backend_type() { + let rsid_map = GenotypeStore::from_bytes( + "sample.txt", + b"rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + assert_eq!(rsid_map.backend_name(), "text"); + assert!(rsid_map.supports(QueryKind::GenotypeByRsid)); + assert!(!rsid_map.supports(QueryKind::GenotypeByLocus)); + + let dir = temp_dir("backend-capabilities"); + let text_path = dir.join("sample.txt"); + fs::write( + &text_path, + "rsid\tchromosome\tposition\tgenotype\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + let delimited = GenotypeStore::from_file(&text_path).unwrap(); + assert_eq!(delimited.backend_name(), "text"); + assert!(delimited.supports(QueryKind::GenotypeByRsid)); + assert!(delimited.supports(QueryKind::GenotypeByLocus)); + + let vcf_path = dir.join("sample.vcf"); + fs::write( + &vcf_path, + "##fileformat=VCFv4.2\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + let vcf = GenotypeStore::from_file(&vcf_path).unwrap(); + assert_eq!(vcf.backend_name(), "vcf"); + assert!(vcf.supports(QueryKind::GenotypeByRsid)); + assert!(vcf.supports(QueryKind::GenotypeByLocus)); + + let cram = GenotypeStore::from_file_with_options( + &dir.join("sample.dat"), + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Cram), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + assert_eq!(cram.backend_name(), "cram"); + assert!(!cram.supports(QueryKind::GenotypeByRsid)); + assert!(cram.supports(QueryKind::GenotypeByLocus)); +} + +#[test] +fn genotype_store_from_bytes_handles_vcf() { + let store = 
GenotypeStore::from_bytes( + "sample.vcf", + b"##fileformat=VCFv4.2\n\ + ##FORMAT=\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + assert_eq!(store.backend_name(), "vcf"); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn vcf_bytes_skip_unusable_rows_and_decode_no_call_forms() { + let store = GenotypeStore::from_bytes( + "sample.vcf", + b"##fileformat=VCFv4.2\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\t.\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ + 1\t11\trsEmptyRef\t.\tG\t.\tPASS\t.\tGT\t0/1\n\ + 1\t12\trsEmptyAlt\tA\t.\t.\tPASS\t.\tGT\t0/1\n\ + 1\t13\trsShort\tA\tG\n\ + 1\t14\trsNoCall\tA\tG\t.\tPASS\t.\tGT\t.\n\ + 1\t15\trsPartialNoCall\tA\tG\t.\tPASS\t.\tGT\t./1\n\ + 1\t16\trsOutOfRange\tA\tG\t.\tPASS\t.\tGT\t0/2\n\ + 1\t17\trsValid\tC\tT\t.\tPASS\t.\tGT\t1|1\n", + ) + .unwrap(); + + assert_eq!(store.backend_name(), "vcf"); + assert_eq!(store.get("rsValid").unwrap().as_deref(), Some("TT")); + assert_eq!(store.get("rsNoCall").unwrap().as_deref(), Some("--")); + assert_eq!(store.get("rsPartialNoCall").unwrap().as_deref(), Some("--")); + assert_eq!(store.get("rsOutOfRange").unwrap(), None); + assert_eq!(store.get("rsEmptyRef").unwrap().as_deref(), Some(".G")); + assert_eq!(store.get("rsEmptyAlt").unwrap(), None); +} + +#[test] +fn extensionless_vcf_is_detected_by_content_and_can_be_forced() { + let dir = temp_dir("extensionless-vcf"); + let path = dir.join("sample.data"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=GRCh37\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let detected = GenotypeStore::from_file(&path).unwrap(); + assert_eq!(detected.backend_name(), "vcf"); + assert_eq!(detected.get("rs1").unwrap().as_deref(), Some("AG")); + + let forced = GenotypeStore::from_file_with_options( + &path, + 
&GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Vcf), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + assert_eq!(forced.backend_name(), "vcf"); + assert_eq!(forced.get("rs1").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn vcf_file_lookup_handles_gt_field_order_no_calls_and_bad_positions() { + let dir = temp_dir("vcf-field-order"); + let path = dir.join("sample.vcf"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=GRCh38\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\trs1\tA\tG\t.\tPASS\t.\tDP:GT\t14:0|1\n\ + 1\t11\trs2\tC\tT\t.\tPASS\t.\tGT:DP\t./.:9\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + assert_eq!(store.get("rs1").unwrap().as_deref(), Some("AG")); + assert_eq!(store.get("rs2").unwrap().as_deref(), Some("--")); + + let bad_path = dir.join("bad.vcf"); + fs::write( + &bad_path, + "##fileformat=VCFv4.2\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\tnot-a-pos\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + let err = GenotypeStore::from_file(&bad_path) + .unwrap() + .get("rs1") + .unwrap_err(); + assert!( + format!("{err:?}").contains("failed to parse VCF position 'not-a-pos'"), + "{err:?}" + ); +} + +#[test] +fn genotype_store_from_bytes_handles_zip() { + let bytes = zip_bytes( + "nested/sample.txt", + b"rsid\tchromosome\tposition\tgenotype\nrs73885319\t22\t36265860\tAG\n", + ); + + let store = GenotypeStore::from_bytes("sample.zip", &bytes).unwrap(); + + assert_eq!(store.backend_name(), "zip"); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn rsid_map_batch_lookup_preserves_order_and_reports_missing_rsids() { + let store = GenotypeStore::from_bytes( + "sample.txt", + b"rsid\tchromosome\tposition\tgenotype\nrs2\t1\t20\tCT\nrs1\t1\t10\tAG\n", + ) + .unwrap(); + + let results = store + .lookup_variants(&[ + VariantSpec { + rsids: vec!["rs2".to_owned()], + ..VariantSpec::default() + 
}, + VariantSpec { + rsids: vec!["rsMissing".to_owned()], + ..VariantSpec::default() + }, + VariantSpec { + rsids: vec!["rs1".to_owned()], + ..VariantSpec::default() + }, + ]) + .unwrap(); + + assert_eq!(results[0].genotype.as_deref(), Some("CT")); + assert_eq!(results[1].genotype, None); + assert_eq!( + results[1].evidence, + vec!["no matching rsid found".to_owned()] + ); + assert_eq!(results[2].genotype.as_deref(), Some("AG")); +} + +#[test] +fn genotype_store_from_bytes_rejects_malformed_zip() { + let err = GenotypeStore::from_bytes("sample.zip", b"not a zip").unwrap_err(); + + assert!( + format!("{err:?}").contains("failed to read genotype zip sample.zip"), + "{err:?}" + ); +} + +#[test] +fn genotype_store_from_bytes_rejects_zip_without_supported_entry() { + let bytes = zip_bytes("notes.bin", b"not genotype data"); + + let err = GenotypeStore::from_bytes("sample.zip", &bytes).unwrap_err(); + + assert!( + format!("{err:?}") + .contains("zip archive sample.zip does not contain a supported genotype file"), + "{err:?}" + ); +} diff --git a/rust/bioscript-formats/tests/file_formats/cram.rs b/rust/bioscript-formats/tests/file_formats/cram.rs new file mode 100644 index 0000000..cb912a2 --- /dev/null +++ b/rust/bioscript-formats/tests/file_formats/cram.rs @@ -0,0 +1,574 @@ +use super::*; + +struct CramFixture { + cram: PathBuf, + reference: PathBuf, + reference_index: PathBuf, + input_index: PathBuf, +} + +fn forced_cram_store(dir: &std::path::Path, reference_name: &str) -> GenotypeStore { + GenotypeStore::from_file_with_options( + &dir.join("missing.cram"), + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Cram), + reference_file: Some(dir.join(reference_name)), + reference_index: Some(dir.join(format!("{reference_name}.fai"))), + input_index: Some(dir.join("missing.cram.crai")), + allow_reference_md5_mismatch: false, + }, + ) + .unwrap() +} + +#[test] +fn forced_cram_backend_reports_reference_and_coordinate_errors_without_reading_cram() { + let dir = 
temp_dir("cram-reference-errors"); + let cram_path = dir.join("missing.cram"); + let store_without_reference = GenotypeStore::from_file_with_options( + &cram_path, + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Cram), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + let err = store_without_reference + .lookup_variant(&VariantSpec { + rsids: vec!["rs1".to_owned()], + ..VariantSpec::default() + }) + .unwrap_err(); + assert!( + format!("{err:?}").contains("without --reference-file"), + "{err:?}" + ); + + let store = forced_cram_store(&dir, "GRCh38.fa"); + let err = store + .lookup_variant(&VariantSpec { + rsids: vec!["rs1".to_owned()], + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("needs GRCh37/GRCh38 coordinates")); + assert!(format!("{err:?}").contains("reference index")); + assert!(format!("{err:?}").contains("input index")); +} + +#[test] +fn forced_cram_backend_reports_snp_and_indel_argument_errors_without_reading_cram() { + let dir = temp_dir("cram-variant-argument-errors"); + let store = forced_cram_store(&dir, "GRCh38.fa"); + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + kind: Some(VariantKind::Snp), + alternate: Some("G".to_owned()), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("SNP variant requires ref/reference")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + reference: Some("A".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("SNP variant requires alt/alternate")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + kind: Some(VariantKind::Deletion), + 
..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("deletion variant requires deletion_length")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + kind: Some(VariantKind::Indel), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("indel variant requires ref/reference")); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + reference: Some("A".to_owned()), + kind: Some(VariantKind::Insertion), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("indel variant requires alt/alternate")); +} + +#[test] +fn forced_cram_backend_reports_file_and_assembly_errors_without_reading_cram() { + let dir = temp_dir("cram-file-assembly-errors"); + let store = forced_cram_store(&dir, "GRCh38.fa"); + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!( + format!("{err:?}").contains("failed to open indexed FASTA"), + "{err:?}" + ); + + let err = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Other), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("does not yet support Other")); + + let hg19_store = forced_cram_store(&dir, "hg19.fa"); + let err = hg19_store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + kind: 
Some(VariantKind::Other), + ..VariantSpec::default() + }) + .unwrap_err(); + assert!(format!("{err:?}").contains("does not yet support Other")); +} + +fn run_large_cram_tests() -> bool { + env::var_os("BIOSCRIPT_RUN_LARGE_TESTS").is_some() +} + +fn require_large_cram_tests(test_name: &str) -> bool { + if run_large_cram_tests() { + true + } else { + eprintln!("skipping {test_name}: set BIOSCRIPT_RUN_LARGE_TESTS=1 to enable"); + false + } +} + +fn cram_fixture_or_skip(test_name: &str) -> Option { + if !require_large_cram_tests(test_name) { + return None; + } + let root = shared_test_data_root()?; + let fx = CramFixture { + cram: root.join("1k-genomes/aligned/NA06985.final.cram"), + reference: root.join("1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"), + reference_index: root.join("1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai"), + input_index: root.join("1k-genomes/aligned/NA06985.final.cram.crai"), + }; + for p in [ + &fx.cram, + &fx.reference, + &fx.reference_index, + &fx.input_index, + ] { + if !p.exists() { + eprintln!("skipping {test_name}: missing {}", p.display()); + return None; + } + } + Some(fx) +} + +fn chr_y_cram_fixture_or_skip(test_name: &str) -> Option { + let root = shared_test_data_root()?; + let fx = CramFixture { + cram: root.join("NA06985-chrY/aligned/NA06985.final.chrY.cram"), + reference: root.join("NA06985-chrY/ref/GRCh38_chrY.fa"), + reference_index: root.join("NA06985-chrY/ref/GRCh38_chrY.fa.fai"), + input_index: root.join("NA06985-chrY/aligned/NA06985.final.chrY.cram.crai"), + }; + for p in [ + &fx.cram, + &fx.reference, + &fx.reference_index, + &fx.input_index, + ] { + if !p.exists() { + eprintln!("skipping {test_name}: missing {}", p.display()); + return None; + } + } + Some(fx) +} + +fn open_cram_store(fx: &CramFixture) -> GenotypeStore { + open_cram_store_with_md5_policy(fx, false) +} + +fn open_cram_store_with_md5_policy( + fx: &CramFixture, + allow_reference_md5_mismatch: bool, +) -> GenotypeStore { + 
GenotypeStore::from_file_with_options( + &fx.cram, + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Cram), + input_index: Some(fx.input_index.clone()), + reference_file: Some(fx.reference.clone()), + reference_index: Some(fx.reference_index.clone()), + allow_reference_md5_mismatch, + }, + ) + .expect("open cram store") +} + +fn mini_fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures") +} + +fn mini_cram_fixture() -> CramFixture { + let dir = mini_fixtures_dir(); + CramFixture { + cram: dir.join("mini.cram"), + reference: dir.join("mini.fa"), + reference_index: dir.join("mini.fa.fai"), + input_index: dir.join("mini.cram.crai"), + } +} + +fn mini_cram_fixture_with_bad_ref() -> CramFixture { + let dir = mini_fixtures_dir(); + CramFixture { + cram: dir.join("mini.cram"), + reference: dir.join("mini_bad_ref.fa"), + reference_index: dir.join("mini_bad_ref.fa.fai"), + input_index: dir.join("mini.cram.crai"), + } +} + +#[test] +fn cram_mini_fixture_streams_only_locus_overlapping_reads() { + // mini.cram has 2000 reads covering chr_test:500..2499. The streaming path + // should decode roughly until it passes the locus and stop — correctness is + // asserted via depth (exactly 50 reads overlap a single base in the middle). + // If the streaming + early-termination path breaks and falls back to full + // slice decode, the wall time still finishes fine on 2000 reads but this + // test also catches regressions that double-count or miss reads. 
+ let fx = mini_cram_fixture(); + let store = open_cram_store(&fx); + + let start = std::time::Instant::now(); + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["mini_locus_1000".to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: "chr_test".to_owned(), + start: 1000, + end: 1000, + }), + reference: Some("A".to_owned()), + alternate: Some("C".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .expect("mini cram lookup"); + let elapsed = start.elapsed(); + + assert_eq!(observation.backend, "cram"); + assert_eq!( + observation.depth.unwrap_or(0), + 50, + "expected exactly 50 reads overlapping chr_test:1000, got {:?}", + observation.depth + ); + // All reads match reference in the fixture so alt_count should be zero. + assert_eq!(observation.ref_count.unwrap_or(0), 50); + assert_eq!(observation.alt_count.unwrap_or(0), 0); + assert!( + elapsed.as_millis() < 2000, + "mini CRAM lookup took {elapsed:?}, expected well under 2s" + ); +} + +#[test] +fn cram_mini_fixture_md5_mismatch_is_tolerated_when_allowed() { + // mini_bad_ref.fa has a single-base mutation at chr_test:2800, inside the + // slice span but far from our query locus at 1000. noodles' strict MD5 + // check will fail; bioscript must warn + retry unchecked + still return + // the correct genotype (the bases at pos 1000 are identical in both refs). 
+ let fx = mini_cram_fixture_with_bad_ref(); + let store = open_cram_store_with_md5_policy(&fx, true); + + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["mini_locus_1000".to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: "chr_test".to_owned(), + start: 1000, + end: 1000, + }), + reference: Some("A".to_owned()), + alternate: Some("C".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .expect("mini cram lookup should succeed via md5 fallback"); + + assert_eq!(observation.backend, "cram"); + assert_eq!( + observation.depth.unwrap_or(0), + 50, + "expected exactly 50 reads after md5 fallback, got {:?}", + observation.depth + ); + // Bases at the query locus are the same in both references, so the + // fallback-decoded reads should still be ref-homozygous. + assert_eq!(observation.ref_count.unwrap_or(0), 50); + assert_eq!(observation.alt_count.unwrap_or(0), 0); +} + +#[test] +fn cram_chr_y_fixture_lookup_is_fast_and_correct() { + let Some(fx) = chr_y_cram_fixture_or_skip("cram_chr_y_fixture_lookup_is_fast_and_correct") + else { + return; + }; + let store = open_cram_store(&fx); + + let start = std::time::Instant::now(); + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["chrY_smoke_3449570".to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: "chrY".to_owned(), + start: 3_449_570, + end: 3_449_570, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .expect("chrY lookup"); + let elapsed = start.elapsed(); + + assert_eq!(observation.backend, "cram"); + let depth = observation.depth.unwrap_or(0); + assert!( + depth >= 8, + "expected >=8 reads at chrY smoke locus, got {depth}" + ); + assert_eq!(observation.alt_count.unwrap_or(0), 0); + assert!( + observation.ref_count.unwrap_or(0) >= 8, + "expected ref-supporting reads at chrY smoke locus, got {:?}", + observation.ref_count + ); 
+ assert!( + elapsed.as_secs() < 5, + "chrY CRAM lookup took {elapsed:?}, expected <5s" + ); +} + +#[test] +fn cram_apol1_snp_lookup_is_fast_and_correct() { + let Some(fx) = cram_fixture_or_skip("cram_apol1_snp_lookup_is_fast_and_correct") else { + return; + }; + let store = open_cram_store(&fx); + + let start = std::time::Instant::now(); + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["rs73885319".to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: "22".to_owned(), + start: 36_265_860, + end: 36_265_860, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .expect("apol1 lookup"); + let elapsed = start.elapsed(); + + assert_eq!(observation.backend, "cram"); + // NA06985 is reference-homozygous at APOL1 G1 site 1 per samtools mpileup. + let depth = observation.depth.unwrap_or(0); + assert!( + depth >= 10, + "expected >=10 reads at APOL1 locus, got {depth}" + ); + let ref_count = observation.ref_count.unwrap_or(0); + let alt_count = observation.alt_count.unwrap_or(0); + assert!( + ref_count > alt_count, + "NA06985 APOL1 G1 site 1 should be ref-dominant: ref={ref_count} alt={alt_count}" + ); + + // Slice-level CRAM decode is the hot path. Samtools does the same locus + // in ~40ms; we allow a generous ceiling to catch regressions (e.g. if the + // streaming/early-termination path breaks and we fall back to decoding + // every record in the slice, this blows past 10s). + assert!( + elapsed.as_secs() < 5, + "APOL1 CRAM lookup took {elapsed:?}, expected <5s (samtools does it in ~40ms)" + ); +} + +#[test] +fn cram_md5_mismatch_is_tolerated_and_returns_correct_result() { + // For NA06985.final.cram, the bundled GRCh38 FASTA's chr6 MD5 does not + // match the @SQ M5 the CRAM was encoded against (only chr22 matches). + // We must warn + fall back to unchecked decoding, and still return the + // correct genotype at the GLP1 rs10305420 locus. 
The correct call per + // samtools mpileup is reference-homozygous (CC). + let Some(fx) = + cram_fixture_or_skip("cram_md5_mismatch_is_tolerated_and_returns_correct_result") + else { + return; + }; + let store = open_cram_store_with_md5_policy(&fx, true); + + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["rs10305420".to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: "6".to_owned(), + start: 39_048_860, + end: 39_048_860, + }), + reference: Some("C".to_owned()), + alternate: Some("T".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .expect("glp1 lookup should succeed via md5 fallback"); + + assert_eq!(observation.backend, "cram"); + let depth = observation.depth.unwrap_or(0); + assert!( + depth >= 10, + "expected >=10 reads at GLP1 locus after md5 fallback, got {depth}" + ); + let genotype = observation + .genotype + .as_deref() + .expect("expected a genotype call"); + assert!( + genotype.chars().all(|c| c == 'C' || c == 'T'), + "unexpected genotype after md5 fallback: {genotype}" + ); + + // Parity with `samtools mpileup -f -r chr6:39048860-39048860`: + // that locus shows a mixed pileup (roughly half reference C, half T). + // We assert total depth matches samtools' reported depth within a small + // tolerance — confirms we are not silently dropping or duplicating reads + // after the unchecked-reference fallback. 
+ let depth_i32 = i32::try_from(depth).unwrap_or(i32::MAX); + let samtools_depth: i32 = 41; + assert!( + (depth_i32 - samtools_depth).abs() <= 6, + "depth {depth_i32} differs from samtools mpileup depth {samtools_depth} by >6" + ); +} + +#[test] +fn cram_rs9357296_reports_heterozygous_counts_for_na06985() { + let Some(fx) = cram_fixture_or_skip("cram_rs9357296_reports_heterozygous_counts_for_na06985") + else { + return; + }; + let store = open_cram_store_with_md5_policy(&fx, true); + + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["rs9357296".to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: "6".to_owned(), + start: 39_011_352, + end: 39_011_352, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .expect("rs9357296 lookup"); + + assert_eq!(observation.backend, "cram"); + assert_eq!(observation.genotype.as_deref(), Some("AG")); + assert_eq!(observation.raw_counts.get("A").copied(), Some(18)); + assert_eq!(observation.raw_counts.get("G").copied(), Some(12)); + assert_eq!(observation.raw_counts.get("T").copied(), None); + assert_eq!(observation.depth, Some(29)); + assert_eq!(observation.ref_count, Some(17)); + assert_eq!(observation.alt_count, Some(12)); + assert!( + observation + .decision + .as_deref() + .is_some_and(|text| text.contains("alt_fraction=0.414")), + "missing SNP decision summary: {:?}", + observation.decision + ); + assert!( + observation + .evidence + .iter() + .any(|line| line.contains("raw pileup depth=30")), + "missing raw pileup evidence: {:?}", + observation.evidence + ); + assert!( + observation.evidence.iter().any(|line| { + line.contains("filtered_duplicate=4") + && line.contains("filtered_low_base_quality=1") + && line.contains("filtered_improper_pair=0") + }), + "missing mpileup-style filter evidence: {:?}", + observation.evidence + ); +} diff --git a/rust/bioscript-formats/tests/file_formats/delimited.rs 
b/rust/bioscript-formats/tests/file_formats/delimited.rs new file mode 100644 index 0000000..350672e --- /dev/null +++ b/rust/bioscript-formats/tests/file_formats/delimited.rs @@ -0,0 +1,103 @@ +use super::*; + +#[test] +fn delimited_parser_handles_comments_blank_lines_csv_and_split_alleles() { + let dir = temp_dir("csv-split-alleles"); + let path = dir.join("sample.csv"); + fs::write( + &path, + "\n\ + # rsid,chromosome,position,allele1,allele2\n\ + // ignored comment\n\ + rs73885319,chr22,36265860,a,g\n\ + rs60910145,22,36265900,n/a,\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); + assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("--")); + + let observation = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "22".to_owned(), + start: 36_265_860, + end: 36_265_861, + }), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(observation.genotype.as_deref(), Some("AG")); + assert_eq!( + observation.evidence, + vec!["resolved by locus chr22:36265860".to_owned()] + ); +} + +#[test] +fn delimited_parser_uses_comment_headers_aliases_quotes_and_extra_columns() { + let dir = temp_dir("comment-header-aliases"); + let path = dir.join("sample.csv"); + fs::write( + &path, + "# SNP ID, Chrom, Base Pair Position, Result, Ignored\n\ + \"rsQuoted\", \"chr3\", \"300\", \"a t\", \"unused, value\"\n\ + rsSlash,3,301,A/-,\n\ + rsNone,3,302,None,\n\ + no_position,3,,AG,\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + + assert_eq!(store.get("rsQuoted").unwrap().as_deref(), Some("AT")); + assert_eq!(store.get("rsSlash").unwrap().as_deref(), Some("ID")); + assert_eq!(store.get("rsNone").unwrap().as_deref(), Some("--")); + assert_eq!(store.get("no_position").unwrap().as_deref(), Some("AG")); + let observation = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { 
+ chrom: "3".to_owned(), + start: 300, + end: 300, + }), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(observation.genotype.as_deref(), Some("AT")); +} + +#[test] +fn delimited_parser_handles_space_delimited_rows_without_headers_and_inline_comments() { + let dir = temp_dir("space-default-header"); + let path = dir.join("sample.txt"); + fs::write( + &path, + "\n\ + rsSpace chr2 200 tc # inline comment\n\ + chrOnly chr2 201 aa\n\ + badrow\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + + assert_eq!(store.get("rsSpace").unwrap().as_deref(), Some("TC")); + let observation = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "2".to_owned(), + start: 201, + end: 201, + }), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(observation.genotype.as_deref(), Some("AA")); + assert_eq!( + observation.evidence, + vec!["resolved by locus chr2:201".to_owned()] + ); +} diff --git a/rust/bioscript-formats/tests/file_formats/vcf.rs b/rust/bioscript-formats/tests/file_formats/vcf.rs new file mode 100644 index 0000000..0496b82 --- /dev/null +++ b/rust/bioscript-formats/tests/file_formats/vcf.rs @@ -0,0 +1,230 @@ +use super::*; + +#[test] +fn vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { + let dir = temp_dir("vcf-chr-normalize"); + let path = dir.join("sample.vcf"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=GRCh38\n\ + ##FORMAT=\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + chr1\t1000\t.\tA\tC,G\t.\tPASS\t.\tGT\t2/1\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + let observation = store + .lookup_variant(&VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 1000, + end: 1001, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap(); + + 
assert_eq!(observation.genotype.as_deref(), Some("GC")); + assert_eq!(observation.assembly, Some(bioscript_core::Assembly::Grch38)); + assert_eq!( + observation.evidence, + vec!["resolved by locus chr1:1000".to_owned()] + ); +} + +#[test] +fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { + let dir = temp_dir("vcf-indel-locus"); + let path = dir.join("sample.hg19.vcf"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=hg19\n\ + ##FORMAT=\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t99\t.\tAT\tA\t.\tPASS\t.\tGT\t0/1\n\ + chr1\t199\t.\tA\tATG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + let deletion = store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "chr1".to_owned(), + start: 100, + end: 100, + }), + reference: Some("AT".to_owned()), + alternate: Some("A".to_owned()), + kind: Some(VariantKind::Deletion), + deletion_length: Some(1), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(deletion.genotype.as_deref(), Some("ID")); + assert_eq!(deletion.assembly, Some(bioscript_core::Assembly::Grch37)); + assert_eq!(deletion.evidence, vec!["resolved by locus 1:99".to_owned()]); + + let insertion = store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 200, + end: 200, + }), + reference: Some("A".to_owned()), + alternate: Some("ATG".to_owned()), + kind: Some(VariantKind::Insertion), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(insertion.genotype.as_deref(), Some("DI")); + assert_eq!( + insertion.evidence, + vec!["resolved by locus chr1:199".to_owned()] + ); + + let unresolved = store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 300, + end: 300, + }), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap(); + 
assert_eq!(unresolved.genotype, None); + assert_eq!( + unresolved.evidence, + vec!["no matching rsid or locus found for variant_by_locus".to_owned()] + ); +} + +#[test] +fn vcf_variant_lookup_reads_single_sample_calls() { + let dir = temp_dir("vcf"); + let vcf_path = dir.join("apol1_sample.vcf"); + fs::write( + &vcf_path, + "##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ +22\t36265900\trs60910145\tT\tG\t.\tPASS\t.\tGT\t1/1\n\ +22\t36266005\trs71785313\tA\tATTTAA\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&vcf_path).unwrap(); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); + assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("GG")); + + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["rs71785313".to_owned()], + kind: Some(VariantKind::Insertion), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(observation.genotype.as_deref(), Some("DI")); +} + +#[test] +fn vcf_variant_lookup_ignores_symbolic_non_ref_alt_when_decoding_gt() { + let dir = temp_dir("vcf-non-ref"); + let vcf_path = dir.join("sample.g.vcf"); + fs::write( + &vcf_path, + "##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +6\t39016636\trs10305420\tC\tT,\t.\tPASS\t.\tGT\t0/1\n\ +6\t38979128\trs9357296\tA\tG,\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&vcf_path).unwrap(); + assert_eq!(store.get("rs10305420").unwrap().as_deref(), Some("CT")); + assert_eq!(store.get("rs9357296").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn real_world_clean_vcf_supports_locus_lookup_without_rsids() { + let test_name = "real_world_clean_vcf_supports_locus_lookup_without_rsids"; + let Some(clean_vcf) = shared_fixture_or_skip(test_name, "1k-genomes/vcf/NA06985.clean.vcf.gz") + else { + return; + }; + let 
Some(original_vcf) = shared_fixture_or_skip(test_name, "1k-genomes/vcf/NA06985.vcf.gz") + else { + return; + }; + + let clean_store = GenotypeStore::from_file(&clean_vcf).expect("open cleaned VCF"); + let original_store = GenotypeStore::from_file(&original_vcf).expect("open original VCF"); + + let queries = [ + ( + VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 12_783, + end: 12_783, + }), + reference: Some("G".to_owned()), + alternate: Some("A".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + "GA", + ), + ( + VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 13_110, + end: 13_110, + }), + reference: Some("G".to_owned()), + alternate: Some("A".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + "GA", + ), + ( + VariantSpec { + rsids: vec!["rs78601809".to_owned()], + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 15_211, + end: 15_211, + }), + reference: Some("T".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + "GG", + ), + ]; + + for (query, expected_genotype) in queries { + let clean = clean_store.lookup_variant(&query).expect("clean lookup"); + let original = original_store + .lookup_variant(&query) + .expect("original lookup"); + + assert_eq!(clean.backend, "vcf"); + assert_eq!(clean.genotype.as_deref(), Some(expected_genotype)); + assert_eq!(original.genotype, clean.genotype); + } +} diff --git a/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs b/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs new file mode 100644 index 0000000..0c40663 --- /dev/null +++ b/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs @@ -0,0 +1,230 @@ +use super::*; + +#[test] +fn batch_lookup_preserves_input_order_after_coordinate_sorting() { + let dir = temp_dir("batch-order"); + let path = 
dir.join("sample.txt"); + fs::write( + &path, + "rsid\tchromosome\tposition\tgenotype\n\ + rs2\t1\t20\tCT\n\ + rs1\t1\t10\tAG\n", + ) + .unwrap(); + let store = GenotypeStore::from_file(&path).unwrap(); + + let results = store + .lookup_variants(&[ + VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 20, + end: 20, + }), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 10, + end: 10, + }), + ..VariantSpec::default() + }, + ]) + .unwrap(); + + assert_eq!(results[0].genotype.as_deref(), Some("CT")); + assert_eq!( + results[0].evidence, + vec!["resolved by locus 1:20".to_owned()] + ); + assert_eq!(results[1].genotype.as_deref(), Some("AG")); + assert_eq!( + results[1].evidence, + vec!["resolved by locus 1:10".to_owned()] + ); +} + +#[test] +fn zip_genotype_file_is_auto_detected_and_readable() { + let dir = temp_dir("zip-auto"); + let zip_path = dir.join("apol1-input.zip"); + + let file = fs::File::create(&zip_path).unwrap(); + let mut writer = zip::ZipWriter::new(file); + writer + .start_file("test_snps.txt", SimpleFileOptions::default()) + .unwrap(); + writer + .write_all( + b"rsid\tchromosome\tposition\tgenotype\nrs73885319\t22\t1\tAG\nrs60910145\t22\t2\tTG\nrs71785313\t22\t3\tII\n", + ) + .unwrap(); + writer.finish().unwrap(); + + let store = GenotypeStore::from_file(&zip_path).unwrap(); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); + assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("TG")); + assert_eq!(store.get("rs71785313").unwrap().as_deref(), Some("II")); +} + +#[test] +fn zip_genotype_file_can_be_forced_by_format() { + let dir = temp_dir("zip-forced"); + let zip_path = dir.join("apol1-input.dat"); + + let file = fs::File::create(&zip_path).unwrap(); + let mut writer = zip::ZipWriter::new(file); + writer + .start_file("test_snps.txt", SimpleFileOptions::default()) + .unwrap(); + writer + 
.write_all(b"rsid\tchromosome\tposition\tgenotype\nrs73885319\t22\t1\tAG\n") + .unwrap(); + writer.finish().unwrap(); + + let store = GenotypeStore::from_file_with_options( + &zip_path, + &GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Zip), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn zip_vcf_entry_is_auto_detected_and_readable() { + let dir = temp_dir("zip-vcf"); + let zip_path = dir.join("apol1-sample.zip"); + + let file = fs::File::create(&zip_path).unwrap(); + let mut writer = zip::ZipWriter::new(file); + writer + .start_file("nested/sample.vcf", SimpleFileOptions::default()) + .unwrap(); + writer + .write_all( + b"##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + writer.finish().unwrap(); + + let store = GenotypeStore::from_file(&zip_path).unwrap(); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn zip_vcf_gz_entry_is_selected_and_read_as_vcf() { + let dir = temp_dir("zip-vcf-gz-entry"); + let zip_path = dir.join("sample.zip"); + + let file = fs::File::create(&zip_path).unwrap(); + let mut writer = zip::ZipWriter::new(file); + writer + .add_directory("nested/", SimpleFileOptions::default()) + .unwrap(); + writer + .start_file("nested/sample.vcf.gz", SimpleFileOptions::default()) + .unwrap(); + writer + .write_all( + b"##fileformat=VCFv4.2\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +2\t22\trsZipVcfGz\tG\tA\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + writer.finish().unwrap(); + + let store = GenotypeStore::from_file(&zip_path).unwrap(); + assert_eq!(store.backend_name(), "vcf"); + assert_eq!(store.get("rsZipVcfGz").unwrap().as_deref(), Some("GA")); +} + +#[test] +fn shared_real_world_zipped_genotype_exports_are_readable() { + struct FixtureExpectation { + 
relative: &'static str, + rsid: &'static str, + genotype: &'static str, + } + + let fixtures = [ + FixtureExpectation { + relative: "23andme/v2/hu0199C8/23data20100526.txt.zip", + rsid: "rs3094315", + genotype: "AA", + }, + FixtureExpectation { + relative: "23andme/v3/huE4DAE4/huE4DAE4_20120522224129.txt.zip", + rsid: "rs3131972", + genotype: "GG", + }, + FixtureExpectation { + relative: "23andme/v4/huE18D82/genome__v4_Full_2016.txt.zip", + rsid: "rs3131972", + genotype: "AG", + }, + FixtureExpectation { + relative: "23andme/v5/hu50B3F5/genome_hu50B3F5_v5_Full.zip", + rsid: "rs116587930", + genotype: "GG", + }, + FixtureExpectation { + relative: "dynamicdna/100001-synthetic/100001_X_X_GSAv3-DTC_GRCh38-07-12-2025.txt.zip", + rsid: "rs116587930", + genotype: "GG", + }, + FixtureExpectation { + relative: "ancestrydna/huE922FC/AncestryDNA.txt.zip", + rsid: "rs3131972", + genotype: "GG", + }, + FixtureExpectation { + relative: "familytreedna/hu17B792/2017-04-29_Family_Tree_DNA_Data.csv.zip", + rsid: "rs1000530", + genotype: "TT", + }, + FixtureExpectation { + relative: "genesforgood/hu80B047/GFG0_filtered_imputed_genotypes_noY_noMT_23andMe.txt.zip", + rsid: "rs3094315", + genotype: "AA", + }, + FixtureExpectation { + relative: "myheritage/hu33515F/MyHeritage_raw_dna_data.zip", + rsid: "rs3131972", + genotype: "GG", + }, + ]; + + for fixture in fixtures { + let Some(path) = shared_fixture_or_skip( + "shared_real_world_zipped_genotype_exports_are_readable", + fixture.relative, + ) else { + return; + }; + + let store = GenotypeStore::from_file(&path).unwrap(); + assert_eq!( + store.get(fixture.rsid).unwrap().as_deref(), + Some(fixture.genotype), + "fixture {}", + fixture.relative + ); + } +} + +#[test] +fn bundled_dynamicdna_gsav3_plain_text_fixture_is_readable() { + let path = repo_root() + .join("old/examples/apol1/genotype_files/108179_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt"); + let store = GenotypeStore::from_file(&path).unwrap(); + 
assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AA")); +} diff --git a/rust/bioscript-wasm/src/inspect_api.rs b/rust/bioscript-wasm/src/inspect_api.rs new file mode 100644 index 0000000..cbe67d6 --- /dev/null +++ b/rust/bioscript-wasm/src/inspect_api.rs @@ -0,0 +1,159 @@ +use std::path::PathBuf; + +use bioscript_formats::{ + inspect_bytes as inspect_bytes_rs, DetectedKind, DetectionConfidence, FileContainer, + FileInspection, InspectOptions, SourceMetadata, +}; +use bioscript_schema::resolve_remote_resource_text as resolve_remote_resource_text_rs; +use serde::{Deserialize, Serialize}; +use wasm_bindgen::prelude::*; + +#[derive(Default, Deserialize)] +struct InspectOptionsJs { + input_index: Option, + reference_file: Option, + reference_index: Option, +} + +/// Classify bytes as a known genomic file. Mirrors `bioscript-formats::inspect::inspect_bytes`. +/// Returns JSON matching the `Inspection` shape the app already uses. +#[wasm_bindgen(js_name = inspectBytes)] +pub fn inspect_bytes( + name: &str, + bytes: &[u8], + options_json: Option, +) -> Result { + let options_js: InspectOptionsJs = match options_json { + Some(text) if !text.is_empty() => serde_json::from_str(&text) + .map_err(|err| JsError::new(&format!("invalid InspectOptions JSON: {err}")))?, + _ => InspectOptionsJs::default(), + }; + let options = InspectOptions { + input_index: options_js.input_index.map(PathBuf::from), + reference_file: options_js.reference_file.map(PathBuf::from), + reference_index: options_js.reference_index.map(PathBuf::from), + }; + + let inspection = inspect_bytes_rs(name, bytes, &options) + .map_err(|err| JsError::new(&format!("inspect_bytes failed: {err:?}")))?; + + let resp = InspectionJs::from(inspection); + serde_json::to_string(&resp) + .map_err(|err| JsError::new(&format!("failed to encode response: {err}"))) +} + +/// Classify a fetched remote resource and return dependency requirements. 
+/// +/// Network access stays in the host app so each platform can prompt before +/// fetching. The schema/type/dependency logic lives here so web, mobile, +/// desktop, and CLI share one implementation. +#[wasm_bindgen(js_name = resolveRemoteResourceText)] +pub fn resolve_remote_resource_text( + source_url: &str, + name: &str, + text: &str, +) -> Result { + let resolved = resolve_remote_resource_text_rs(source_url, name, text) + .map_err(|err| JsError::new(&format!("resolve remote resource failed: {err}")))?; + serde_json::to_string(&resolved) + .map_err(|err| JsError::new(&format!("failed to encode response: {err}"))) +} + +// Wire types — we flatten the Rust FileInspection into the shape the app's +// TS Inspection type already expects (matches widgets/FilePicker/types.ts). +#[derive(Serialize)] +struct InspectionJs { + #[serde(rename = "fileName")] + file_name: String, + container: &'static str, + #[serde(rename = "detectedKind")] + detected_kind: &'static str, + confidence: &'static str, + assembly: Option<&'static str>, + phased: Option, + source: Option, + #[serde(rename = "selectedEntry", skip_serializing_if = "Option::is_none")] + selected_entry: Option, + #[serde(rename = "hasIndex", skip_serializing_if = "Option::is_none")] + has_index: Option, + #[serde(rename = "referenceMatches", skip_serializing_if = "Option::is_none")] + reference_matches: Option, + evidence: Vec, + warnings: Vec, + #[serde(rename = "durationMs")] + duration_ms: u128, +} + +#[derive(Serialize)] +struct SourceJs { + vendor: String, + #[serde(rename = "platformVersion", skip_serializing_if = "Option::is_none")] + platform_version: Option, + confidence: &'static str, + evidence: Vec, +} + +impl From for InspectionJs { + fn from(i: FileInspection) -> Self { + InspectionJs { + file_name: i.path.display().to_string(), + container: render_container(i.container), + detected_kind: render_kind(i.detected_kind), + confidence: render_confidence(i.confidence), + assembly: 
i.assembly.map(render_assembly), + phased: i.phased, + source: i.source.map(SourceJs::from), + selected_entry: i.selected_entry, + has_index: i.has_index, + reference_matches: i.reference_matches, + evidence: i.evidence, + warnings: i.warnings, + duration_ms: i.duration_ms, + } + } +} + +impl From for SourceJs { + fn from(s: SourceMetadata) -> Self { + SourceJs { + vendor: s.vendor.unwrap_or_default(), + platform_version: s.platform_version, + confidence: render_confidence(s.confidence), + evidence: s.evidence, + } + } +} + +fn render_container(c: FileContainer) -> &'static str { + match c { + FileContainer::Plain => "plain", + FileContainer::Zip => "zip", + } +} + +fn render_kind(k: DetectedKind) -> &'static str { + match k { + DetectedKind::GenotypeText => "genotype_text", + DetectedKind::Vcf => "vcf", + DetectedKind::AlignmentCram => "alignment_cram", + DetectedKind::AlignmentBam => "alignment_bam", + DetectedKind::ReferenceFasta => "reference_fasta", + DetectedKind::Unknown => "unknown", + } +} + +fn render_confidence(c: DetectionConfidence) -> &'static str { + match c { + DetectionConfidence::Authoritative => "authoritative", + DetectionConfidence::StrongHeuristic => "strong_heuristic", + DetectionConfidence::WeakHeuristic => "weak_heuristic", + DetectionConfidence::Unknown => "unknown", + } +} + +pub(crate) fn render_assembly(a: bioscript_core::Assembly) -> &'static str { + match a { + bioscript_core::Assembly::Grch37 => "grch37", + bioscript_core::Assembly::Grch38 => "grch38", + } +} diff --git a/rust/bioscript-wasm/src/lib.rs b/rust/bioscript-wasm/src/lib.rs index b2c5607..b438a72 100644 --- a/rust/bioscript-wasm/src/lib.rs +++ b/rust/bioscript-wasm/src/lib.rs @@ -14,631 +14,20 @@ //! - Index-less fallback (linear scan or on-the-fly index build). //! - Indel / deletion observations on CRAM. 
+mod inspect_api; mod js_reader; +mod lookup_api; +mod variant_yaml; -use std::{io::BufReader, path::PathBuf}; - -use bioscript_core::{GenomicLocus, VariantKind, VariantObservation, VariantSpec}; -use bioscript_formats::{ - alignment, inspect_bytes as inspect_bytes_rs, observe_cram_indel_with_reader, - observe_cram_snp_with_reader, observe_vcf_snp_with_reader, DetectedKind, DetectionConfidence, - FileContainer, FileInspection, GenotypeStore, InspectOptions, SourceMetadata, -}; -use bioscript_schema::{ - load_variant_manifest_text_for_lookup, - resolve_remote_resource_text as resolve_remote_resource_text_rs, +pub use inspect_api::{inspect_bytes, resolve_remote_resource_text}; +pub use lookup_api::{ + lookup_cram_variants, lookup_genotype_bytes_rsids, lookup_genotype_bytes_variants, + lookup_vcf_variants, }; -use noodles::csi as noodles_csi; -use serde::{Deserialize, Serialize}; -use wasm_bindgen::prelude::*; +pub use variant_yaml::compile_variant_yaml_text; -use crate::js_reader::JsReader; - -#[wasm_bindgen(start)] +#[wasm_bindgen::prelude::wasm_bindgen(start)] pub fn start() { #[cfg(feature = "console_error_panic_hook")] console_error_panic_hook::set_once(); } - -#[derive(Default, Deserialize)] -struct InspectOptionsJs { - input_index: Option, - reference_file: Option, - reference_index: Option, -} - -/// Classify bytes as a known genomic file. Mirrors `bioscript-formats::inspect::inspect_bytes`. -/// Returns JSON matching the `Inspection` shape the app already uses. 
-#[wasm_bindgen(js_name = inspectBytes)] -pub fn inspect_bytes( - name: &str, - bytes: &[u8], - options_json: Option, -) -> Result { - let options_js: InspectOptionsJs = match options_json { - Some(text) if !text.is_empty() => serde_json::from_str(&text) - .map_err(|err| JsError::new(&format!("invalid InspectOptions JSON: {err}")))?, - _ => InspectOptionsJs::default(), - }; - let options = InspectOptions { - input_index: options_js.input_index.map(PathBuf::from), - reference_file: options_js.reference_file.map(PathBuf::from), - reference_index: options_js.reference_index.map(PathBuf::from), - }; - - let inspection = inspect_bytes_rs(name, bytes, &options) - .map_err(|err| JsError::new(&format!("inspect_bytes failed: {err:?}")))?; - - let resp = InspectionJs::from(inspection); - serde_json::to_string(&resp) - .map_err(|err| JsError::new(&format!("failed to encode response: {err}"))) -} - -/// Classify a fetched remote resource and return dependency requirements. -/// -/// Network access stays in the host app so each platform can prompt before -/// fetching. The schema/type/dependency logic lives here so web, mobile, -/// desktop, and CLI share one implementation. -#[wasm_bindgen(js_name = resolveRemoteResourceText)] -pub fn resolve_remote_resource_text( - source_url: &str, - name: &str, - text: &str, -) -> Result { - let resolved = resolve_remote_resource_text_rs(source_url, name, text) - .map_err(|err| JsError::new(&format!("resolve remote resource failed: {err}")))?; - serde_json::to_string(&resolved) - .map_err(|err| JsError::new(&format!("failed to encode response: {err}"))) -} - -#[derive(Deserialize)] -struct VariantInput { - name: String, - chrom: String, - // 1-based genomic interval. `pos` is accepted for older callers. 
- #[serde(default)] - pos: Option, - #[serde(default)] - start: Option, - #[serde(default)] - end: Option, - #[serde(rename = "ref")] - ref_base: String, - #[serde(rename = "alt")] - alt_base: String, - #[serde(default)] - rsid: Option, - #[serde(default)] - assembly: Option, - #[serde(default)] - kind: Option, -} - -#[derive(Serialize)] -struct CompiledVariantSpecJs { - name: String, - chrom: String, - start: i64, - end: i64, - #[serde(rename = "ref")] - ref_base: String, - #[serde(rename = "alt")] - alt_base: String, - #[serde(skip_serializing_if = "Option::is_none")] - rsid: Option, - #[serde(skip_serializing_if = "Option::is_none")] - assembly: Option, - #[serde(skip_serializing_if = "Option::is_none")] - kind: Option, -} - -#[wasm_bindgen(js_name = compileVariantYamlText)] -pub fn compile_variant_yaml_text(name: &str, text: &str) -> Result { - let manifest = load_variant_manifest_text_for_lookup(name, text) - .map_err(|err| JsError::new(&format!("compile variant YAML failed: {err}")))?; - let spec = manifest.spec; - let ref_base = spec - .reference - .clone() - .ok_or_else(|| JsError::new(&format!("variant {}: alleles.ref missing", manifest.name)))?; - let alt_base = spec - .alternate - .clone() - .ok_or_else(|| JsError::new(&format!("variant {}: alleles.alts missing", manifest.name)))?; - let rsid = spec.rsids.first().cloned(); - let kind = spec.kind.map(|kind| { - match kind { - bioscript_core::VariantKind::Snp => "snv", - bioscript_core::VariantKind::Insertion => "insertion", - bioscript_core::VariantKind::Deletion => "deletion", - bioscript_core::VariantKind::Indel => "indel", - bioscript_core::VariantKind::Other => "other", - } - .to_owned() - }); - let mut out = Vec::new(); - if let Some(locus) = spec.grch38 { - out.push(CompiledVariantSpecJs { - name: manifest.name.clone(), - chrom: locus.chrom, - start: locus.start, - end: locus.end, - ref_base: ref_base.clone(), - alt_base: alt_base.clone(), - rsid: rsid.clone(), - assembly: Some("grch38".to_owned()), 
- kind: kind.clone(), - }); - } - if let Some(locus) = spec.grch37 { - out.push(CompiledVariantSpecJs { - name: if out.is_empty() { - manifest.name.clone() - } else { - format!("{}_grch37", manifest.name) - }, - chrom: locus.chrom, - start: locus.start, - end: locus.end, - ref_base, - alt_base, - rsid, - assembly: Some("grch37".to_owned()), - kind, - }); - } - if out.is_empty() { - return Err(JsError::new(&format!( - "variant {} has no coordinates", - manifest.name - ))); - } - serde_json::to_string(&out) - .map_err(|err| JsError::new(&format!("failed to encode compiled variant: {err}"))) -} - -#[derive(Serialize)] -struct VariantObservationJs { - name: String, - backend: String, - #[serde(rename = "ref", skip_serializing_if = "Option::is_none")] - reference: Option, - #[serde(rename = "alt", skip_serializing_if = "Option::is_none")] - alternate: Option, - #[serde(rename = "matchedRsid", skip_serializing_if = "Option::is_none")] - matched_rsid: Option, - #[serde(skip_serializing_if = "Option::is_none")] - assembly: Option, - #[serde(skip_serializing_if = "Option::is_none")] - genotype: Option, - #[serde(rename = "refCount", skip_serializing_if = "Option::is_none")] - ref_count: Option, - #[serde(rename = "altCount", skip_serializing_if = "Option::is_none")] - alt_count: Option, - #[serde(skip_serializing_if = "Option::is_none")] - depth: Option, - #[serde(rename = "rawCounts")] - raw_counts: std::collections::BTreeMap, - #[serde(skip_serializing_if = "Option::is_none")] - decision: Option, - evidence: Vec, -} - -/// Observe a list of SNP variants against an indexed CRAM + reference FASTA, -/// with the bulk bytes pulled on demand via JS-supplied `readAt(offset, len)` -/// callbacks. The small index payloads (`.crai`, `.fai`) are passed inline. -/// -/// Both callbacks must return a `Uint8Array` synchronously (or via a Node -/// sync read) — wasm's `Read + Seek` contract is synchronous. Async reads are -/// a follow-up that needs buffered pre-fetch on the JS side. 
-#[wasm_bindgen(js_name = lookupCramVariants)] -pub fn lookup_cram_variants( - cram_read_at: js_sys::Function, - cram_len: f64, - crai_bytes: &[u8], - fasta_read_at: js_sys::Function, - fasta_len: f64, - fai_bytes: &[u8], - variants_json: &str, -) -> Result { - let crai_index = alignment::parse_crai_bytes(crai_bytes) - .map_err(|err| JsError::new(&format!("parse crai: {err:?}")))?; - let fai_index = alignment::parse_fai_bytes(fai_bytes) - .map_err(|err| JsError::new(&format!("parse fai: {err:?}")))?; - - let fasta_reader = BufReader::new(JsReader::new(fasta_read_at, fasta_len as u64, "fasta")); - let repository = alignment::build_reference_repository_from_readers(fasta_reader, fai_index); - - let cram_reader = JsReader::new(cram_read_at, cram_len as u64, "cram"); - let mut indexed = - alignment::build_cram_indexed_reader_from_reader(cram_reader, crai_index, repository) - .map_err(|err| JsError::new(&format!("build cram reader: {err:?}")))?; - - let variants: Vec = serde_json::from_str(variants_json) - .map_err(|err| JsError::new(&format!("parse variantsJson: {err}")))?; - - let mut results = Vec::with_capacity(variants.len()); - for variant in variants { - let assembly = variant.assembly.as_deref().and_then(parse_assembly_str); - let start = variant - .start - .or(variant.pos) - .ok_or_else(|| JsError::new(&format!("variant {}: start/pos missing", variant.name)))?; - let end = variant.end.unwrap_or(start); - let locus = GenomicLocus { - chrom: variant.chrom.clone(), - start, - end, - }; - let kind = parse_variant_kind(variant.kind.as_deref()).unwrap_or(VariantKind::Snp); - let observation = - match kind { - VariantKind::Snp => { - ensure_single_base_variant(&variant)?; - let ref_char = variant.ref_base.chars().next().ok_or_else(|| { - JsError::new(&format!("variant {}: empty ref", variant.name)) - })?; - let alt_char = variant.alt_base.chars().next().ok_or_else(|| { - JsError::new(&format!("variant {}: empty alt", variant.name)) - })?; - 
observe_cram_snp_with_reader( - &mut indexed, - &variant.name, - &locus, - ref_char, - alt_char, - variant.rsid.clone(), - assembly, - ) - } - VariantKind::Insertion | VariantKind::Indel => observe_cram_indel_with_reader( - &mut indexed, - &variant.name, - &locus, - &variant.ref_base, - &variant.alt_base, - variant.rsid.clone(), - assembly, - ), - other => { - return Err(JsError::new(&format!( - "variant {} has unsupported kind {:?} for web CRAM lookup", - variant.name, other - ))); - } - } - .map_err(|err| JsError::new(&format!("lookup {}: {err:?}", variant.name)))?; - results.push(VariantObservationJs { - name: variant.name, - backend: observation.backend, - reference: Some(variant.ref_base), - alternate: Some(variant.alt_base), - matched_rsid: observation.matched_rsid, - assembly: observation.assembly.map(|a| render_assembly(a).to_owned()), - genotype: observation.genotype, - ref_count: observation.ref_count, - alt_count: observation.alt_count, - depth: observation.depth, - raw_counts: observation.raw_counts, - decision: observation.decision, - evidence: observation.evidence, - }); - } - - serde_json::to_string(&results).map_err(|err| JsError::new(&format!("encode results: {err}"))) -} - -/// Observe a list of SNP variants against a bgzipped + tabix-indexed VCF, -/// with the bulk bytes pulled on demand via a JS-supplied `readAt(offset, len)` -/// callback. The small `.tbi` payload is passed inline. -/// -/// The reader must provide the VCF synchronously — on web this is a -/// `FileReaderSync`-backed callback running inside a Web Worker. 
-#[wasm_bindgen(js_name = lookupVcfVariants)] -pub fn lookup_vcf_variants( - vcf_read_at: js_sys::Function, - vcf_len: f64, - tbi_bytes: &[u8], - variants_json: &str, -) -> Result { - let tabix_index = alignment::parse_tbi_bytes(tbi_bytes) - .map_err(|err| JsError::new(&format!("parse tbi: {err:?}")))?; - let vcf_reader = JsReader::new(vcf_read_at, vcf_len as u64, "vcf"); - let mut indexed = noodles_csi::io::IndexedReader::new(vcf_reader, tabix_index); - - let variants: Vec = serde_json::from_str(variants_json) - .map_err(|err| JsError::new(&format!("parse variantsJson: {err}")))?; - - let mut results = Vec::with_capacity(variants.len()); - for variant in variants { - ensure_single_base_variant(&variant)?; - let ref_char = variant - .ref_base - .chars() - .next() - .ok_or_else(|| JsError::new(&format!("variant {}: empty ref", variant.name)))?; - let alt_char = variant - .alt_base - .chars() - .next() - .ok_or_else(|| JsError::new(&format!("variant {}: empty alt", variant.name)))?; - let assembly = variant.assembly.as_deref().and_then(parse_assembly_str); - let start = variant - .start - .or(variant.pos) - .ok_or_else(|| JsError::new(&format!("variant {}: start/pos missing", variant.name)))?; - let end = variant.end.unwrap_or(start); - let locus = GenomicLocus { - chrom: variant.chrom.clone(), - start, - end, - }; - let observation = observe_vcf_snp_with_reader( - &mut indexed, - &variant.name, - &locus, - ref_char, - alt_char, - variant.rsid.clone(), - assembly, - ) - .map_err(|err| JsError::new(&format!("vcf lookup {}: {err:?}", variant.name)))?; - results.push(VariantObservationJs { - name: variant.name, - backend: observation.backend, - reference: Some(variant.ref_base), - alternate: Some(variant.alt_base), - matched_rsid: observation.matched_rsid, - assembly: observation.assembly.map(|a| render_assembly(a).to_owned()), - genotype: observation.genotype, - ref_count: observation.ref_count, - alt_count: observation.alt_count, - depth: observation.depth, - 
raw_counts: observation.raw_counts, - decision: observation.decision, - evidence: observation.evidence, - }); - } - - serde_json::to_string(&results).map_err(|err| JsError::new(&format!("encode results: {err}"))) -} - -#[wasm_bindgen(js_name = lookupGenotypeBytesVariants)] -pub fn lookup_genotype_bytes_variants( - name: &str, - bytes: &[u8], - variants_json: &str, -) -> Result { - let store = GenotypeStore::from_bytes(name, bytes) - .map_err(|err| JsError::new(&format!("load genotype bytes {name}: {err:?}")))?; - let variants: Vec = serde_json::from_str(variants_json) - .map_err(|err| JsError::new(&format!("parse variantsJson: {err}")))?; - let specs = variants - .iter() - .map(variant_input_to_spec) - .collect::, _>>()?; - let observations = store - .lookup_variants(&specs) - .map_err(|err| JsError::new(&format!("lookup genotype bytes {name}: {err:?}")))?; - let rows = variants - .into_iter() - .zip(observations) - .map(|(variant, observation)| observation_to_js(variant, observation)) - .collect::>(); - serde_json::to_string(&rows).map_err(|err| JsError::new(&format!("encode results: {err}"))) -} - -#[wasm_bindgen(js_name = lookupGenotypeBytesRsids)] -pub fn lookup_genotype_bytes_rsids( - name: &str, - bytes: &[u8], - rsids_json: &str, -) -> Result { - let store = GenotypeStore::from_bytes(name, bytes) - .map_err(|err| JsError::new(&format!("load genotype bytes {name}: {err:?}")))?; - let rsids: Vec = serde_json::from_str(rsids_json) - .map_err(|err| JsError::new(&format!("parse rsidsJson: {err}")))?; - let values = rsids - .iter() - .map(|rsid| { - store - .get(rsid) - .map_err(|err| JsError::new(&format!("lookup genotype rsid {rsid}: {err:?}"))) - }) - .collect::, _>>()?; - serde_json::to_string(&values).map_err(|err| JsError::new(&format!("encode results: {err}"))) -} - -fn ensure_single_base_variant(variant: &VariantInput) -> Result<(), JsError> { - let kind = variant - .kind - .as_deref() - .unwrap_or("snv") - .to_ascii_lowercase(); - let is_snp_kind = 
matches!(kind.as_str(), "snp" | "snv" | "variant" | ""); - if !is_snp_kind - || variant.ref_base.chars().count() != 1 - || variant.alt_base.chars().count() != 1 - { - return Err(JsError::new(&format!( - "variant {} has kind/ref/alt {} {}/{}; web CRAM/VCF lookup currently supports single-base SNV observations only", - variant.name, - variant.kind.as_deref().unwrap_or("snv"), - variant.ref_base, - variant.alt_base - ))); - } - Ok(()) -} - -fn variant_input_to_spec(variant: &VariantInput) -> Result { - let start = variant - .start - .or(variant.pos) - .ok_or_else(|| JsError::new(&format!("variant {}: start/pos missing", variant.name)))?; - let end = variant.end.unwrap_or(start); - let locus = GenomicLocus { - chrom: variant.chrom.clone(), - start, - end, - }; - let assembly = variant.assembly.as_deref().and_then(parse_assembly_str); - let kind = parse_variant_kind(variant.kind.as_deref()); - Ok(VariantSpec { - rsids: variant.rsid.clone().into_iter().collect(), - grch37: if assembly == Some(bioscript_core::Assembly::Grch37) { - Some(locus.clone()) - } else { - None - }, - grch38: if assembly == Some(bioscript_core::Assembly::Grch38) || assembly.is_none() { - Some(locus) - } else { - None - }, - reference: Some(variant.ref_base.clone()), - alternate: Some(variant.alt_base.clone()), - kind, - deletion_length: None, - motifs: Vec::new(), - }) -} - -fn parse_variant_kind(kind: Option<&str>) -> Option { - match kind.unwrap_or("").to_ascii_lowercase().as_str() { - "snp" | "snv" | "variant" | "" => Some(VariantKind::Snp), - "insertion" => Some(VariantKind::Insertion), - "deletion" => Some(VariantKind::Deletion), - "indel" => Some(VariantKind::Indel), - "other" => Some(VariantKind::Other), - _ => Some(VariantKind::Other), - } -} - -fn observation_to_js( - variant: VariantInput, - observation: VariantObservation, -) -> VariantObservationJs { - VariantObservationJs { - name: variant.name, - backend: observation.backend, - reference: Some(variant.ref_base), - alternate: 
Some(variant.alt_base), - matched_rsid: observation.matched_rsid, - assembly: observation.assembly.map(|a| render_assembly(a).to_owned()), - genotype: observation.genotype, - ref_count: observation.ref_count, - alt_count: observation.alt_count, - depth: observation.depth, - raw_counts: observation.raw_counts, - decision: observation.decision, - evidence: observation.evidence, - } -} - -fn parse_assembly_str(s: &str) -> Option { - match s.to_ascii_lowercase().as_str() { - "grch37" | "hg19" | "b37" => Some(bioscript_core::Assembly::Grch37), - "grch38" | "hg38" => Some(bioscript_core::Assembly::Grch38), - _ => None, - } -} - -// Wire types — we flatten the Rust FileInspection into the shape the app's -// TS Inspection type already expects (matches widgets/FilePicker/types.ts). -#[derive(Serialize)] -struct InspectionJs { - #[serde(rename = "fileName")] - file_name: String, - container: &'static str, - #[serde(rename = "detectedKind")] - detected_kind: &'static str, - confidence: &'static str, - assembly: Option<&'static str>, - phased: Option, - source: Option, - #[serde(rename = "selectedEntry", skip_serializing_if = "Option::is_none")] - selected_entry: Option, - #[serde(rename = "hasIndex", skip_serializing_if = "Option::is_none")] - has_index: Option, - #[serde(rename = "referenceMatches", skip_serializing_if = "Option::is_none")] - reference_matches: Option, - evidence: Vec, - warnings: Vec, - #[serde(rename = "durationMs")] - duration_ms: u128, -} - -#[derive(Serialize)] -struct SourceJs { - vendor: String, - #[serde(rename = "platformVersion", skip_serializing_if = "Option::is_none")] - platform_version: Option, - confidence: &'static str, - evidence: Vec, -} - -impl From for InspectionJs { - fn from(i: FileInspection) -> Self { - InspectionJs { - file_name: i.path.display().to_string(), - container: render_container(i.container), - detected_kind: render_kind(i.detected_kind), - confidence: render_confidence(i.confidence), - assembly: 
i.assembly.map(render_assembly), - phased: i.phased, - source: i.source.map(SourceJs::from), - selected_entry: i.selected_entry, - has_index: i.has_index, - reference_matches: i.reference_matches, - evidence: i.evidence, - warnings: i.warnings, - duration_ms: i.duration_ms, - } - } -} - -impl From for SourceJs { - fn from(s: SourceMetadata) -> Self { - SourceJs { - vendor: s.vendor.unwrap_or_default(), - platform_version: s.platform_version, - confidence: render_confidence(s.confidence), - evidence: s.evidence, - } - } -} - -fn render_container(c: FileContainer) -> &'static str { - match c { - FileContainer::Plain => "plain", - FileContainer::Zip => "zip", - } -} - -fn render_kind(k: DetectedKind) -> &'static str { - match k { - DetectedKind::GenotypeText => "genotype_text", - DetectedKind::Vcf => "vcf", - DetectedKind::AlignmentCram => "alignment_cram", - DetectedKind::AlignmentBam => "alignment_bam", - DetectedKind::ReferenceFasta => "reference_fasta", - DetectedKind::Unknown => "unknown", - } -} - -fn render_confidence(c: DetectionConfidence) -> &'static str { - match c { - DetectionConfidence::Authoritative => "authoritative", - DetectionConfidence::StrongHeuristic => "strong_heuristic", - DetectionConfidence::WeakHeuristic => "weak_heuristic", - DetectionConfidence::Unknown => "unknown", - } -} - -fn render_assembly(a: bioscript_core::Assembly) -> &'static str { - match a { - bioscript_core::Assembly::Grch37 => "grch37", - bioscript_core::Assembly::Grch38 => "grch38", - } -} diff --git a/rust/bioscript-wasm/src/lookup_api.rs b/rust/bioscript-wasm/src/lookup_api.rs new file mode 100644 index 0000000..5483eb5 --- /dev/null +++ b/rust/bioscript-wasm/src/lookup_api.rs @@ -0,0 +1,334 @@ +use std::io::BufReader; + +use bioscript_core::{GenomicLocus, VariantKind, VariantObservation, VariantSpec}; +use bioscript_formats::{ + alignment, observe_cram_indel_with_reader, observe_cram_snp_with_reader, + observe_vcf_snp_with_reader, GenotypeStore, +}; +use noodles::csi as 
noodles_csi; +use serde::{Deserialize, Serialize}; +use wasm_bindgen::prelude::*; + +use crate::{inspect_api::render_assembly, js_reader::JsReader}; + +#[derive(Deserialize)] +struct VariantInput { + name: String, + chrom: String, + // 1-based genomic interval. `pos` is accepted for older callers. + #[serde(default)] + pos: Option, + #[serde(default)] + start: Option, + #[serde(default)] + end: Option, + #[serde(rename = "ref")] + ref_base: String, + #[serde(rename = "alt")] + alt_base: String, + #[serde(default)] + rsid: Option, + #[serde(default)] + assembly: Option, + #[serde(default)] + kind: Option, +} + +#[derive(Serialize)] +struct VariantObservationJs { + name: String, + backend: String, + #[serde(rename = "ref", skip_serializing_if = "Option::is_none")] + reference: Option, + #[serde(rename = "alt", skip_serializing_if = "Option::is_none")] + alternate: Option, + #[serde(rename = "matchedRsid", skip_serializing_if = "Option::is_none")] + matched_rsid: Option, + #[serde(skip_serializing_if = "Option::is_none")] + assembly: Option, + #[serde(skip_serializing_if = "Option::is_none")] + genotype: Option, + #[serde(rename = "refCount", skip_serializing_if = "Option::is_none")] + ref_count: Option, + #[serde(rename = "altCount", skip_serializing_if = "Option::is_none")] + alt_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + depth: Option, + #[serde(rename = "rawCounts")] + raw_counts: std::collections::BTreeMap, + #[serde(skip_serializing_if = "Option::is_none")] + decision: Option, + evidence: Vec, +} + +/// Observe a list of SNP variants against an indexed CRAM + reference FASTA, +/// with the bulk bytes pulled on demand via JS-supplied `readAt(offset, len)` +/// callbacks. The small index payloads (`.crai`, `.fai`) are passed inline. +/// +/// Both callbacks must return a `Uint8Array` synchronously (or via a Node +/// sync read) — wasm's `Read + Seek` contract is synchronous. 
Async reads are +/// a follow-up that needs buffered pre-fetch on the JS side. +#[wasm_bindgen(js_name = lookupCramVariants)] +pub fn lookup_cram_variants( + cram_read_at: js_sys::Function, + cram_len: f64, + crai_bytes: &[u8], + fasta_read_at: js_sys::Function, + fasta_len: f64, + fai_bytes: &[u8], + variants_json: &str, +) -> Result { + let crai_index = alignment::parse_crai_bytes(crai_bytes) + .map_err(|err| JsError::new(&format!("parse crai: {err:?}")))?; + let fai_index = alignment::parse_fai_bytes(fai_bytes) + .map_err(|err| JsError::new(&format!("parse fai: {err:?}")))?; + + let fasta_reader = BufReader::new(JsReader::new(fasta_read_at, fasta_len as u64, "fasta")); + let repository = alignment::build_reference_repository_from_readers(fasta_reader, fai_index); + + let cram_reader = JsReader::new(cram_read_at, cram_len as u64, "cram"); + let mut indexed = + alignment::build_cram_indexed_reader_from_reader(cram_reader, crai_index, repository) + .map_err(|err| JsError::new(&format!("build cram reader: {err:?}")))?; + + let variants = parse_variants_json(variants_json)?; + let mut results = Vec::with_capacity(variants.len()); + for variant in variants { + let assembly = variant.assembly.as_deref().and_then(parse_assembly_str); + let locus = variant_locus(&variant)?; + let kind = parse_variant_kind(variant.kind.as_deref()).unwrap_or(VariantKind::Snp); + let observation = match kind { + VariantKind::Snp => { + ensure_single_base_variant(&variant)?; + let ref_char = first_allele_char(&variant.name, &variant.ref_base, "ref")?; + let alt_char = first_allele_char(&variant.name, &variant.alt_base, "alt")?; + observe_cram_snp_with_reader( + &mut indexed, + &variant.name, + &locus, + ref_char, + alt_char, + variant.rsid.clone(), + assembly, + ) + } + VariantKind::Insertion | VariantKind::Indel => observe_cram_indel_with_reader( + &mut indexed, + &variant.name, + &locus, + &variant.ref_base, + &variant.alt_base, + variant.rsid.clone(), + assembly, + ), + other => { + 
return Err(JsError::new(&format!( + "variant {} has unsupported kind {:?} for web CRAM lookup", + variant.name, other + ))); + } + } + .map_err(|err| JsError::new(&format!("lookup {}: {err:?}", variant.name)))?; + results.push(observation_to_js(variant, observation)); + } + + serde_json::to_string(&results).map_err(|err| JsError::new(&format!("encode results: {err}"))) +} + +/// Observe a list of SNP variants against a bgzipped + tabix-indexed VCF, +/// with the bulk bytes pulled on demand via a JS-supplied `readAt(offset, len)` +/// callback. The small `.tbi` payload is passed inline. +/// +/// The reader must provide the VCF synchronously — on web this is a +/// `FileReaderSync`-backed callback running inside a Web Worker. +#[wasm_bindgen(js_name = lookupVcfVariants)] +pub fn lookup_vcf_variants( + vcf_read_at: js_sys::Function, + vcf_len: f64, + tbi_bytes: &[u8], + variants_json: &str, +) -> Result { + let tabix_index = alignment::parse_tbi_bytes(tbi_bytes) + .map_err(|err| JsError::new(&format!("parse tbi: {err:?}")))?; + let vcf_reader = JsReader::new(vcf_read_at, vcf_len as u64, "vcf"); + let mut indexed = noodles_csi::io::IndexedReader::new(vcf_reader, tabix_index); + + let variants = parse_variants_json(variants_json)?; + let mut results = Vec::with_capacity(variants.len()); + for variant in variants { + ensure_single_base_variant(&variant)?; + let ref_char = first_allele_char(&variant.name, &variant.ref_base, "ref")?; + let alt_char = first_allele_char(&variant.name, &variant.alt_base, "alt")?; + let assembly = variant.assembly.as_deref().and_then(parse_assembly_str); + let locus = variant_locus(&variant)?; + let observation = observe_vcf_snp_with_reader( + &mut indexed, + &variant.name, + &locus, + ref_char, + alt_char, + variant.rsid.clone(), + assembly, + ) + .map_err(|err| JsError::new(&format!("vcf lookup {}: {err:?}", variant.name)))?; + results.push(observation_to_js(variant, observation)); + } + + serde_json::to_string(&results).map_err(|err| 
JsError::new(&format!("encode results: {err}"))) +} + +#[wasm_bindgen(js_name = lookupGenotypeBytesVariants)] +pub fn lookup_genotype_bytes_variants( + name: &str, + bytes: &[u8], + variants_json: &str, +) -> Result { + let store = GenotypeStore::from_bytes(name, bytes) + .map_err(|err| JsError::new(&format!("load genotype bytes {name}: {err:?}")))?; + let variants = parse_variants_json(variants_json)?; + let specs = variants + .iter() + .map(variant_input_to_spec) + .collect::, _>>()?; + let observations = store + .lookup_variants(&specs) + .map_err(|err| JsError::new(&format!("lookup genotype bytes {name}: {err:?}")))?; + let rows = variants + .into_iter() + .zip(observations) + .map(|(variant, observation)| observation_to_js(variant, observation)) + .collect::>(); + serde_json::to_string(&rows).map_err(|err| JsError::new(&format!("encode results: {err}"))) +} + +#[wasm_bindgen(js_name = lookupGenotypeBytesRsids)] +pub fn lookup_genotype_bytes_rsids( + name: &str, + bytes: &[u8], + rsids_json: &str, +) -> Result { + let store = GenotypeStore::from_bytes(name, bytes) + .map_err(|err| JsError::new(&format!("load genotype bytes {name}: {err:?}")))?; + let rsids: Vec = serde_json::from_str(rsids_json) + .map_err(|err| JsError::new(&format!("parse rsidsJson: {err}")))?; + let values = rsids + .iter() + .map(|rsid| { + store + .get(rsid) + .map_err(|err| JsError::new(&format!("lookup genotype rsid {rsid}: {err:?}"))) + }) + .collect::, _>>()?; + serde_json::to_string(&values).map_err(|err| JsError::new(&format!("encode results: {err}"))) +} + +fn parse_variants_json(variants_json: &str) -> Result, JsError> { + serde_json::from_str(variants_json) + .map_err(|err| JsError::new(&format!("parse variantsJson: {err}"))) +} + +fn variant_locus(variant: &VariantInput) -> Result { + let start = variant + .start + .or(variant.pos) + .ok_or_else(|| JsError::new(&format!("variant {}: start/pos missing", variant.name)))?; + let end = variant.end.unwrap_or(start); + Ok(GenomicLocus 
{ + chrom: variant.chrom.clone(), + start, + end, + }) +} + +fn first_allele_char(variant_name: &str, allele: &str, label: &str) -> Result { + allele + .chars() + .next() + .ok_or_else(|| JsError::new(&format!("variant {variant_name}: empty {label}"))) +} + +fn ensure_single_base_variant(variant: &VariantInput) -> Result<(), JsError> { + let kind = variant + .kind + .as_deref() + .unwrap_or("snv") + .to_ascii_lowercase(); + let is_snp_kind = matches!(kind.as_str(), "snp" | "snv" | "variant" | ""); + if !is_snp_kind + || variant.ref_base.chars().count() != 1 + || variant.alt_base.chars().count() != 1 + { + return Err(JsError::new(&format!( + "variant {} has kind/ref/alt {} {}/{}; web CRAM/VCF lookup currently supports single-base SNV observations only", + variant.name, + variant.kind.as_deref().unwrap_or("snv"), + variant.ref_base, + variant.alt_base + ))); + } + Ok(()) +} + +fn variant_input_to_spec(variant: &VariantInput) -> Result { + let locus = variant_locus(variant)?; + let assembly = variant.assembly.as_deref().and_then(parse_assembly_str); + let kind = parse_variant_kind(variant.kind.as_deref()); + Ok(VariantSpec { + rsids: variant.rsid.clone().into_iter().collect(), + grch37: if assembly == Some(bioscript_core::Assembly::Grch37) { + Some(locus.clone()) + } else { + None + }, + grch38: if assembly == Some(bioscript_core::Assembly::Grch38) || assembly.is_none() { + Some(locus) + } else { + None + }, + reference: Some(variant.ref_base.clone()), + alternate: Some(variant.alt_base.clone()), + kind, + deletion_length: None, + motifs: Vec::new(), + }) +} + +fn parse_variant_kind(kind: Option<&str>) -> Option { + match kind.unwrap_or("").to_ascii_lowercase().as_str() { + "snp" | "snv" | "variant" | "" => Some(VariantKind::Snp), + "insertion" => Some(VariantKind::Insertion), + "deletion" => Some(VariantKind::Deletion), + "indel" => Some(VariantKind::Indel), + "other" => Some(VariantKind::Other), + _ => Some(VariantKind::Other), + } +} + +fn observation_to_js( + 
variant: VariantInput, + observation: VariantObservation, +) -> VariantObservationJs { + VariantObservationJs { + name: variant.name, + backend: observation.backend, + reference: Some(variant.ref_base), + alternate: Some(variant.alt_base), + matched_rsid: observation.matched_rsid, + assembly: observation.assembly.map(|a| render_assembly(a).to_owned()), + genotype: observation.genotype, + ref_count: observation.ref_count, + alt_count: observation.alt_count, + depth: observation.depth, + raw_counts: observation.raw_counts, + decision: observation.decision, + evidence: observation.evidence, + } +} + +fn parse_assembly_str(s: &str) -> Option { + match s.to_ascii_lowercase().as_str() { + "grch37" | "hg19" | "b37" => Some(bioscript_core::Assembly::Grch37), + "grch38" | "hg38" => Some(bioscript_core::Assembly::Grch38), + _ => None, + } +} diff --git a/rust/bioscript-wasm/src/variant_yaml.rs b/rust/bioscript-wasm/src/variant_yaml.rs new file mode 100644 index 0000000..0d3e70f --- /dev/null +++ b/rust/bioscript-wasm/src/variant_yaml.rs @@ -0,0 +1,86 @@ +use bioscript_schema::load_variant_manifest_text_for_lookup; +use serde::Serialize; +use wasm_bindgen::prelude::*; + +#[derive(Serialize)] +struct CompiledVariantSpecJs { + name: String, + chrom: String, + start: i64, + end: i64, + #[serde(rename = "ref")] + ref_base: String, + #[serde(rename = "alt")] + alt_base: String, + #[serde(skip_serializing_if = "Option::is_none")] + rsid: Option, + #[serde(skip_serializing_if = "Option::is_none")] + assembly: Option, + #[serde(skip_serializing_if = "Option::is_none")] + kind: Option, +} + +#[wasm_bindgen(js_name = compileVariantYamlText)] +pub fn compile_variant_yaml_text(name: &str, text: &str) -> Result { + let manifest = load_variant_manifest_text_for_lookup(name, text) + .map_err(|err| JsError::new(&format!("compile variant YAML failed: {err}")))?; + let spec = manifest.spec; + let ref_base = spec + .reference + .clone() + .ok_or_else(|| JsError::new(&format!("variant {}: 
alleles.ref missing", manifest.name)))?; + let alt_base = spec + .alternate + .clone() + .ok_or_else(|| JsError::new(&format!("variant {}: alleles.alts missing", manifest.name)))?; + let rsid = spec.rsids.first().cloned(); + let kind = spec.kind.map(|kind| { + match kind { + bioscript_core::VariantKind::Snp => "snv", + bioscript_core::VariantKind::Insertion => "insertion", + bioscript_core::VariantKind::Deletion => "deletion", + bioscript_core::VariantKind::Indel => "indel", + bioscript_core::VariantKind::Other => "other", + } + .to_owned() + }); + let mut out = Vec::new(); + if let Some(locus) = spec.grch38 { + out.push(CompiledVariantSpecJs { + name: manifest.name.clone(), + chrom: locus.chrom, + start: locus.start, + end: locus.end, + ref_base: ref_base.clone(), + alt_base: alt_base.clone(), + rsid: rsid.clone(), + assembly: Some("grch38".to_owned()), + kind: kind.clone(), + }); + } + if let Some(locus) = spec.grch37 { + out.push(CompiledVariantSpecJs { + name: if out.is_empty() { + manifest.name.clone() + } else { + format!("{}_grch37", manifest.name) + }, + chrom: locus.chrom, + start: locus.start, + end: locus.end, + ref_base, + alt_base, + rsid, + assembly: Some("grch37".to_owned()), + kind, + }); + } + if out.is_empty() { + return Err(JsError::new(&format!( + "variant {} has no coordinates", + manifest.name + ))); + } + serde_json::to_string(&out) + .map_err(|err| JsError::new(&format!("failed to encode compiled variant: {err}"))) +} diff --git a/test.sh b/test.sh index d822949..8a376e7 100755 --- a/test.sh +++ b/test.sh @@ -35,11 +35,16 @@ cargo_test() { } if [[ "$LARGE" == "1" ]]; then - BIOSCRIPT_RUN_LARGE_TESTS=1 cargo_test -p bioscript-formats --test file_formats --test inspect -- --nocapture + export BIOSCRIPT_RUN_LARGE_TESTS=1 else - cargo_test -p bioscript-formats --test file_formats --test inspect -- --nocapture + unset BIOSCRIPT_RUN_LARGE_TESTS fi -cargo_test -p bioscript-cli --test cli -- --nocapture + +cargo_test -p bioscript-formats --test 
file_formats --lib --test inspect --test prepare -- --nocapture +cargo_test -p bioscript-cli --test cli --bin bioscript -- --nocapture +cargo_test -p bioscript-schema --test validate_variants -- --nocapture +cargo_test -p bioscript-core --lib --test source_size -- --nocapture +cargo_test -p bioscript-runtime --lib --test security --test resources_coverage -- --nocapture if [[ "$REPORT" == "1" ]]; then cargo build -p bioscript-cli