From 0e2e7bf36a98a4974381ac556ed66d580f7465de Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 14 May 2026 14:26:18 +1000 Subject: [PATCH 1/5] starting on variant catalogue --- docs/assay-schema.md | 16 + docs/variant-catalogue-schema.md | 154 +++++++ docs/variant-catalogue-schema.yaml | 26 ++ docs/variant-catalogue.md | 428 ++++++++++++++++++ rust/bioscript-cli/src/cli_bootstrap.rs | 47 +- rust/bioscript-cli/src/report_review.rs | 11 +- rust/bioscript-reporting/src/manifest.rs | 1 + .../src/manifest_provenance.rs | 75 +++ rust/bioscript-runtime/src/runtime.rs | 42 +- rust/bioscript-runtime/src/runtime/methods.rs | 83 ++++ rust/bioscript-runtime/src/runtime/objects.rs | 6 +- rust/bioscript-runtime/src/runtime/state.rs | 2 + rust/bioscript-schema/src/lib.rs | 8 +- rust/bioscript-schema/src/remote_resource.rs | 6 +- rust/bioscript-schema/src/validator.rs | 1 + .../src/validator_catalogue.rs | 176 +++++++ rust/bioscript-schema/src/validator_load.rs | 8 + rust/bioscript-schema/src/validator_panel.rs | 45 ++ rust/bioscript-schema/src/validator_parse.rs | 37 ++ rust/bioscript-schema/src/validator_roots.rs | 5 +- rust/bioscript-schema/src/validator_types.rs | 7 + .../tests/validate_variants.rs | 72 +++ .../src/report_workspace/analysis.rs | 1 + 23 files changed, 1235 insertions(+), 22 deletions(-) create mode 100644 docs/variant-catalogue-schema.md create mode 100644 docs/variant-catalogue-schema.yaml create mode 100644 docs/variant-catalogue.md create mode 100644 rust/bioscript-reporting/src/manifest_provenance.rs create mode 100644 rust/bioscript-schema/src/validator_catalogue.rs diff --git a/docs/assay-schema.md b/docs/assay-schema.md index a6d34bc..3deec23 100644 --- a/docs/assay-schema.md +++ b/docs/assay-schema.md @@ -43,6 +43,11 @@ analyses: - "g1-site-1.yaml" - "g1-site-2.yaml" - "g2-site.yaml" + assets: + - id: "variants" + path: "variants.tsv" + - id: "findings" + path: "findings.tsv" emits: - key: "apol1_status" label: "APOL1 status" @@ -84,10 +89,21 @@ Rules: - `path` points to a BioScript-compatible Python file - `output_format` is optional and defaults to `tsv`; use `json` or `jsonl` when the script writes structured JSON output - `derived_from` lists the variant YAML files used by the interpretation +- `assets` is optional and lists local files the analysis script can read through the injected `asset_paths` dict - `emits` is optional but recommended so report generators know which output columns to display and how to label them - `logic` is optional; use `logic.description` and `logic.source.url` to document where the script's derivation rules came from - Analysis rows may emit `notes` or `report_notes` as a reporting convention. HTML reports render those notes below the analysis table and omit them from the table columns; this avoids a manifest-level template language while still letting the script build human-readable text from computed values. +Analysis scripts receive these injected variables: + +- `input_file`: input genotype/VCF/CRAM path +- `output_file`: path the script must write +- `participant_id`: current participant/sample identifier +- `observations_file`: TSV containing the variant observations already gathered from assay/panel members for this participant +- `asset_paths`: dict from each `assets[].id` to the resolved local path + +This supports large catalogues where Rust/BioScript first gathers all variant observations, then Python joins those observations to attached TSV assets such as `findings.tsv`, `conditions.tsv`, or `rules.tsv` and emits derived classification rows. + ## Findings Use `findings` for evidence that binds either to a variant observation or an emitted analysis value. Keep the executable logic in `analyses`; keep PGx evidence and reporting semantics in YAML. diff --git a/docs/variant-catalogue-schema.md b/docs/variant-catalogue-schema.md new file mode 100644 index 0000000..759d986 --- /dev/null +++ b/docs/variant-catalogue-schema.md @@ -0,0 +1,154 @@ +# Variant Catalogue Schema + +For the assay/runtime design, including direct catalogue members and Python +analysis assets, see [variant-catalogue.md](variant-catalogue.md). + +This schema describes a compact, auditable source catalogue for projects with +many variants. It is intended for curation and packaging. Current BioScript +runtime lookup still consumes normal `bioscript:variant:1.0` variant manifests; +a packaging step can expand this catalogue into per-variant YAML files. + +Use `bioscript:variant-catalogue:1.0` when a project has more than about 10 +variants, or when repeated provenance and per-file boilerplate make review +harder than a tabular source. + +## Shape + +```yaml +schema: "bioscript:variant-catalogue:1.0" +version: "1.0" +name: "thalassemia-variants" + +variants: + source: "variants.tsv" + format: "tsv" + key: "variant_id" + columns: + variant_id: + role: "id" + required: true + rsid: + role: "identifier.rsid" + alts: + role: "alleles.alts" + list_separator: "|" + grch38_pos: + role: "coordinates.grch38.pos" + type: "integer" + +findings: + source: "findings.tsv" + format: "tsv" + key: "variant_id" + columns: + variant_id: + role: "variant.id" + required: true + finding_id: + role: "finding.id" + alt: + role: "finding.alt" + notes: + role: "finding.notes" + +provenance: + sources: + - id: "ithagenes" + kind: "database" + label: "IthaGenes" + url: "https://www.ithanet.eu/db/ithagenes?action=list" + - id: "dbsnp" + kind: "database" + label: "dbSNP" + url_template: "https://www.ncbi.nlm.nih.gov/snp/{rsid}" +``` + +## Required Fields + +- `schema`: must be `bioscript:variant-catalogue:1.0` +- `version`: must be `1.0` +- `name` +- `variants.source` + +`variants.format` and `findings.format`, when present, must be `tsv`. +`variants.columns` and `findings.columns` are recommended for real catalogues so +tools can validate and interpret TSV columns without relying on hardcoded column +names. + +## Recommended Files + +Keep files separate during curation: + +- `variants.yaml`: catalogue manifest and shared provenance +- `variants.tsv`: biological identity and matching fields +- `findings.tsv`: one interpretation row per finding + +Generate per-variant `bioscript:variant:1.0` YAML as a packaging/build artifact. + +## `variants.tsv` + +Recommended columns: + +```text +variant_id name gene rsid aliases kind ref alts observed_alts grch37_chrom grch37_pos grch37_start grch37_end grch38_chrom grch38_pos grch38_start grch38_end +``` + +Use `|` inside list cells such as `aliases`, `alts`, and `observed_alts`. +Use `pos` columns for SNVs and `start`/`end` columns for spans. + +Declare the semantic mapping in `variants.columns`. Recommended roles include: + +```text +id +name +gene +identifier.rsid +identifier.aliases +alleles.kind +alleles.ref +alleles.alts +alleles.observed_alts +coordinates.grch37.chrom +coordinates.grch37.pos +coordinates.grch37.start +coordinates.grch37.end +coordinates.grch38.chrom +coordinates.grch38.pos +coordinates.grch38.start +coordinates.grch38.end +``` + +## `findings.tsv` + +Recommended columns: + +```text +variant_id finding_id schema alt label summary notes +``` + +Additional source-specific columns such as `itha_id`, `functionality`, +`phenotype`, `transcript_hgvs`, `genomic_hgvs`, and `ncbi_spdi` are allowed as +curation data. The packaging tool can combine those atoms into standard variant +finding entries. + +Each `findings.tsv.variant_id` should match a row in `variants.tsv`. Each +allele-specific `alt` should be present in the corresponding variant row's +`alts`, unless `alt` is `*`. + +Declare the semantic mapping in `findings.columns`. Recommended roles include: + +```text +variant.id +finding.id +finding.schema +finding.alt +finding.label +finding.summary +finding.notes +source.itha_id +source.functionality +source.phenotype +source.transcript_hgvs +source.genomic_hgvs +source.ncbi_spdi +``` diff --git a/docs/variant-catalogue-schema.yaml b/docs/variant-catalogue-schema.yaml new file mode 100644 index 0000000..cfe5870 --- /dev/null +++ b/docs/variant-catalogue-schema.yaml @@ -0,0 +1,26 @@ +schema: "bioscript:variant-catalogue:1.0" +version: "1.0" +name: "example-variant-catalogue" +label: "Example Variant Catalogue" +summary: "Compact source catalogue for projects with many variants." + +variants: + source: "variants.tsv" + format: "tsv" + key: "variant_id" + +findings: + source: "findings.tsv" + format: "tsv" + key: "variant_id" + +provenance: + sources: + - id: "dbsnp" + kind: "database" + label: "dbSNP" + url_template: "https://www.ncbi.nlm.nih.gov/snp/{rsid}" + - id: "ncbi_variation" + kind: "database" + label: "NCBI Variation Services" + url_template: "https://api.ncbi.nlm.nih.gov/variation/v0/hgvs/{genomic_hgvs}/contextuals" diff --git a/docs/variant-catalogue.md b/docs/variant-catalogue.md new file mode 100644 index 0000000..ec133fd --- /dev/null +++ b/docs/variant-catalogue.md @@ -0,0 +1,428 @@ +# Variant Catalogue Design Note + +Variant catalogues are for assays and panels that need to test many variants +without making reviewers inspect hundreds of near-identical YAML files. The +curated source stays as plain text TSV plus a small YAML manifest. Build tools +can still expand the catalogue into normal `bioscript:variant:1.0` YAML files +when a package or older runtime path needs per-variant manifests. + +This is a design note for the intended BioScript flow. The catalogue manifest +schema already exists as `bioscript:variant-catalogue:1.0`; direct assay member +support for catalogues is the next BioScript schema/runtime change. + +## Why This Exists + +Small assays are easiest to review as hand-written variant YAML: + +```yaml +members: + - kind: "variant" + path: "variants/rs73885319.yaml" + version: "1.0" +``` + +That breaks down for catalogues such as thalassemia, where IthaGenes/dbSNP style +sources can produce hundreds of rows. Repeating the same provenance, source +URLs, and schema boilerplate in every YAML file makes review noisier. A TSV is +more compact, can be sorted, can be checked with command-line tools, and keeps +the source curation auditable. + +The runtime still needs structured observations and controlled file access. The +intended flow is: + +1. BioScript reads the assay. +2. BioScript expands catalogue members into variant lookup work. +3. BioScript gathers one observation row per variant for the participant. +4. BioScript mounts a small virtual filesystem for the analysis script. +5. BioScript writes observations to a conventional work path. +6. BioScript runs the analysis Python with a structured `bioscript.context`. +7. Python reads TSV inputs with `bioscript.read_tsv(path)`. +8. Python joins observations to attached TSV assets and emits report rows. + +## Required BioScript Schema Change + +Assay and panel `members` should accept a catalogue member: + +```yaml +members: + - kind: "variant-catalogue" + path: "catalogue/variants.yaml" + version: "1.0" +``` + +Today, `bioscript:assay:1.0` assay members accept `kind: "variant"` only, so the +direct member above is the required next schema change. The member should point +to the catalogue YAML manifest, not directly to `variants.tsv`, because the YAML +manifest holds schema identity, version, table locations, keys, and shared +provenance. + +The catalogue manifest shape is: + +```yaml +schema: "bioscript:variant-catalogue:1.0" +version: "1.0" +name: "thalassemia-variants" + +variants: + source: "variants.tsv" + format: "tsv" + key: "variant_id" + columns: + variant_id: + role: "id" + required: true + name: + role: "name" + gene: + role: "gene" + rsid: + role: "identifier.rsid" + aliases: + role: "identifier.aliases" + list_separator: "|" + kind: + role: "alleles.kind" + ref: + role: "alleles.ref" + alts: + role: "alleles.alts" + list_separator: "|" + observed_alts: + role: "alleles.observed_alts" + list_separator: "|" + grch37_chrom: + role: "coordinates.grch37.chrom" + grch37_pos: + role: "coordinates.grch37.pos" + type: "integer" + grch38_chrom: + role: "coordinates.grch38.chrom" + grch38_pos: + role: "coordinates.grch38.pos" + type: "integer" + +findings: + source: "findings.tsv" + format: "tsv" + key: "variant_id" + columns: + variant_id: + role: "variant.id" + required: true + finding_id: + role: "finding.id" + schema: + role: "finding.schema" + alt: + role: "finding.alt" + label: + role: "finding.label" + summary: + role: "finding.summary" + notes: + role: "finding.notes" + +provenance: + sources: + - id: "ithagenes" + kind: "database" + label: "IthaGenes" + url: "https://www.ithanet.eu/db/ithagenes?action=list" + - id: "dbsnp" + kind: "database" + label: "dbSNP" + url_template: "https://www.ncbi.nlm.nih.gov/snp/{rsid}" +``` + +## Column Mapping + +The manifest should declare how TSV columns map to BioScript concepts. This +keeps the TSV compact while avoiding hidden assumptions about column names. +Tools may provide default mappings for conventional columns, but real catalogues +should encode the mapping explicitly. + +Each entry under `variants.columns` or `findings.columns` is keyed by the TSV +column name: + +```yaml +variants: + source: "variants.tsv" + key: "variant_id" + columns: + variant_id: + role: "id" + required: true + rsid: + role: "identifier.rsid" + alts: + role: "alleles.alts" + list_separator: "|" + grch38_pos: + role: "coordinates.grch38.pos" + type: "integer" +``` + +Column mapping fields: + +- `role`: semantic target for this TSV column +- `required`: whether validation should fail if the column is missing or empty +- `type`: optional primitive validation, such as `string` or `integer` +- `list_separator`: separator for list-valued cells, normally `|` + +Recommended `variants.columns` roles: + +- `id` +- `name` +- `gene` +- `identifier.rsid` +- `identifier.aliases` +- `alleles.kind` +- `alleles.ref` +- `alleles.alts` +- `alleles.observed_alts` +- `coordinates.grch37.chrom` +- `coordinates.grch37.pos` +- `coordinates.grch37.start` +- `coordinates.grch37.end` +- `coordinates.grch38.chrom` +- `coordinates.grch38.pos` +- `coordinates.grch38.start` +- `coordinates.grch38.end` + +Recommended `findings.columns` roles: + +- `variant.id` +- `finding.id` +- `finding.schema` +- `finding.alt` +- `finding.label` +- `finding.summary` +- `finding.notes` +- `source.itha_id` +- `source.functionality` +- `source.phenotype` +- `source.transcript_hgvs` +- `source.genomic_hgvs` +- `source.ncbi_spdi` + +Unknown columns may still exist as source curation data, but executable tools +should only depend on columns declared in the manifest. Validation should check +that required mapped columns are present in the TSV header, integer columns parse +as integers when populated, and list-valued columns are split consistently. + +The assay should also allow analysis assets, which are files made available to +the Python analysis by id: + +```yaml +analyses: + - id: "thalassemia_status" + kind: "bioscript" + path: "thalassemia.py" + output_format: "tsv" + derived_from: + - "catalogue/variants.yaml" + assets: + - id: "variants" + path: "catalogue/variants.tsv" + - id: "findings" + path: "catalogue/findings.tsv" + emits: + - key: "thalassemia_status" + label: "Thalassemia status" + value_type: "string" + format: "badge" +``` + +`assets` are not variant members. They are analysis inputs. The variant +catalogue member tells BioScript what to look up; the pipeline files and assets +give Python the extra curation tables it needs to classify the observed +variants. + +## Analysis File Sandbox + +Analysis scripts should not receive arbitrary host filesystem paths. BioScript +should mount a small virtual filesystem with conventional paths: + +```text +/input +/input/pipeline +/work +/output +``` + +Root meanings: + +- `/input`: read-only run inputs, starting with the genotype/sample input file +- `/input/pipeline`: read-only assay package files, including YAML, TSV assets, + and analysis code +- `/work`: read/write runtime workspace for BioScript-generated intermediate + files such as observations +- `/output`: write/preserved output files and auxiliary artifacts + +Initial conventional paths: + +```text +/input/genotypes +/input/participant.tsv +/input/pipeline/assay.yaml +/input/pipeline/catalogue.yaml +/input/pipeline/variants.tsv +/input/pipeline/findings.tsv +/work/observations.tsv +/work/analysis.jsonl +/output/results.tsv +``` + +`/input/participant.tsv` and `/work/analysis.jsonl` are optional future +conventions. They can be absent; scripts should check with `bioscript.exists()`. + +Access rules: + +- `/input` and `/input/pipeline` are read-only +- `/work` is read/write and intended for intermediate files +- `/output` is write/preserved; files here are report artifacts +- host paths, path traversal, and network paths are not available to analysis + scripts + +`/output/results.tsv` is the canonical report output when the script writes a +file. Other files under `/output` are auxiliary artifacts. If a future runtime +captures returned rows/dicts directly, it should reject ambiguous scripts that +both return a primary result and write `/output/results.tsv`. + +## `bioscript.context` + +BioScript should expose a structured context object so scripts can interrogate +the run without hardcoding every path: + +```python +bioscript.context["participant_id"] +bioscript.context["input_files"] +bioscript.context["pipeline_files"] +bioscript.context["assets"] +bioscript.context["observations_file"] +bioscript.context["output_file"] +``` + +Example context shape: + +```python +{ + "participant_id": "genome_hu50B3F5_v5_Full", + "input_files": { + "genotypes": "/input/genotypes", + }, + "pipeline_files": { + "assay": "/input/pipeline/assay.yaml", + "catalogue": "/input/pipeline/catalogue.yaml", + }, + "assets": { + "variants": "/input/pipeline/variants.tsv", + "findings": "/input/pipeline/findings.tsv", + }, + "observations_file": "/work/observations.tsv", + "output_file": "/output/results.tsv", +} +``` + +Compatibility variables may still be injected during migration: + +```python +participant_id = bioscript.context["participant_id"] +observations_file = bioscript.context["observations_file"] +output_file = bioscript.context["output_file"] +asset_paths = bioscript.context["assets"] +``` + +The context object should be read-only from the script's point of view. It can +also expose helper methods later, but a dictionary shape is enough for the first +implementation. + +## Thalassemia Use Case + +The thalassemia project is the motivating case: + +- `catalogue/variants.yaml` stores shared catalogue metadata and provenance. +- `catalogue/variants.tsv` stores variant identity, rsIDs, aliases, alleles, + coordinates, and source curation columns. +- `catalogue/findings.tsv` stores what a variant contributes to: for example + alpha-globin, beta-globin, or other globin/modifier categories, with source + notes such as IthaGenes functionality and phenotype. +- `thalassemia.py` reads `observations_file` plus the TSV assets and emits a + compact classification row. + +This lets the assay report a derived classification such as: + +- no catalogue thalassemia variant observed +- possible beta carrier +- possible biallelic HBB risk +- possible alpha silent carrier or trait +- possible HbH/Hb Bart spectrum + +Those labels require careful clinical curation. The important first step is the +data path: variants become observations, then Python joins observations to +attached TSV assets and emits rows. The exact classification rules need separate +review and should not be inferred from row count alone. + +## Minimal Python Analysis + +This is intentionally a hello-world style analysis. It proves how the injected +files work, but the real thalassemia logic needs proper allele, phase, gene, and +evidence handling. + +```python +import bioscript + + +observations = bioscript.read_tsv(bioscript.context["observations_file"]) +variants = bioscript.read_tsv(bioscript.context["assets"]["variants"]) +findings = bioscript.read_tsv(bioscript.context["assets"]["findings"]) + +# Minimal proof of shape only. Real work should: +# - match observations to variant_id via rsid, aliases, and coordinates +# - verify the observed allele matches the finding allele +# - handle deletions, insertions, complex alleles, no-calls, and phase +# - classify alpha-globin, beta-globin, and modifier findings separately +# - preserve provenance and source notes in emitted rows + +if len(observations) > 0 and len(variants) > 0 and len(findings) > 0: + status = "catalogue_assets_loaded" +else: + status = "catalogue_assets_missing_or_empty" + +bioscript.write_tsv(bioscript.context["output_file"], [ + { + "participant_id": bioscript.context["participant_id"], + "thalassemia_status": status, + } +]) +``` + +The analysis script receives virtual file paths as strings through +`bioscript.context`. `observations_file` is a TSV created by BioScript for the +current participant. `assets` is a dictionary whose keys are the ids from +`analyses[].assets`. + +BioScript should provide `bioscript.read_tsv(path)` as a built-in helper. TSV is +part of the catalogue asset contract, so every analysis should not need to carry +its own parser. The helper should return a list of dictionaries keyed by the TSV +header row, use tab as the delimiter, preserve empty cells as empty strings, and +skip empty lines. Later versions may add typed parsing based on catalogue column +metadata, but the default helper should stay predictable and string-based. + +## Packaging And Review + +Keep these files separate during curation: + +```text +catalogue/variants.yaml +catalogue/variants.tsv +catalogue/findings.tsv +``` + +Generated per-variant YAML files are useful as a packaging artifact, especially +while older runtime paths still expect `kind: "variant"` members. They should not +be the primary review surface for large catalogues. + +The long-term runtime path should be better than generated YAML for large +catalogues: BioScript can stream or batch the TSV-backed catalogue directly into +lookups, then pass observations and assets to the analysis script. diff --git a/rust/bioscript-cli/src/cli_bootstrap.rs b/rust/bioscript-cli/src/cli_bootstrap.rs index 63bae41..7e7024a 100644 --- a/rust/bioscript-cli/src/cli_bootstrap.rs +++ b/rust/bioscript-cli/src/cli_bootstrap.rs @@ -57,13 +57,15 @@ fn run_cli() -> Result<(), String> { Ok(()) } -const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--allow-md5-mismatch] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]"; +const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--observations-file ] [--asset id=path] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--allow-md5-mismatch] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]"; struct CliOptions { script_path: Option, root: Option, input_file: Option, output_file: Option, + observations_file: Option, + asset_paths: BTreeMap, participant_id: Option, trace_report: Option, timing_report: Option, @@ -107,6 +109,8 @@ fn default_cli_options() -> CliOptions { root: None, input_file: None, output_file: None, + observations_file: None, + asset_paths: BTreeMap::new(), participant_id: None, trace_report: None, timing_report: None, @@ -167,6 +171,24 @@ fn parse_cli_path_arg( return Err("--output-file requires a path".to_owned()); }; options.output_file = Some(value); + } else if arg == "--observations-file" { + let Some(value) = args.next() else { + return Err("--observations-file requires a path".to_owned()); + }; + options.observations_file = Some(value); + } else if arg == "--asset" { + let Some(value) = args.next() else { + return Err("--asset requires id=path".to_owned()); + }; + let Some((id, path)) = value.split_once('=') else { + return Err("--asset requires id=path".to_owned()); + }; + let id = id.trim(); + let path = path.trim(); + if id.is_empty() || path.is_empty() { + return Err("--asset requires non-empty id=path".to_owned()); + } + options.asset_paths.insert(id.to_owned(), path.to_owned()); } else if arg == "--participant-id" { let Some(value) = args.next() else { return Err("--participant-id requires a value".to_owned()); @@ -380,6 +402,15 @@ fn run_cli_script( if let Some(output_file) = options.output_file { inputs.push(("output_file", monty::MontyObject::String(output_file))); } + if let Some(observations_file) = options.observations_file { + inputs.push(( + "observations_file", + monty::MontyObject::String(observations_file), + )); + } + if !options.asset_paths.is_empty() { + inputs.push(("asset_paths", monty_string_dict(&options.asset_paths))); + } if let Some(participant_id) = options.participant_id { inputs.push(("participant_id", monty::MontyObject::String(participant_id))); } @@ -395,6 +426,20 @@ fn run_cli_script( Ok(()) } +fn monty_string_dict(values: &BTreeMap) -> monty::MontyObject { + monty::MontyObject::Dict( + values + .iter() + .map(|(key, value)| { + ( + monty::MontyObject::String(key.clone()), + monty::MontyObject::String(value.clone()), + ) + }) + .collect(), + ) +} + fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { if let Some(parent) = path.parent() { fs::create_dir_all(parent).map_err(|err| { diff --git a/rust/bioscript-cli/src/report_review.rs b/rust/bioscript-cli/src/report_review.rs index d10932c..ce413bc 100644 --- a/rust/bioscript-cli/src/report_review.rs +++ b/rust/bioscript-cli/src/report_review.rs @@ -94,13 +94,14 @@ fn generate_review_report(options: &ReviewReportOptions) -> Result<(), String> { let input_bytes = review_case_genotype_text(&case); let store = GenotypeStore::from_bytes(&format!("{}.txt", case.id), input_bytes.as_bytes()) .map_err(|err| err.to_string())?; - let input_observations = run_manifest_rows_with_store( + let manifest_rows = run_manifest_rows_with_store( &options.root, &options.manifest_path, &store, &case.id, &options.filters, - )? + )?; + let input_observations = manifest_rows .iter() .map(|row| { app_observation_from_manifest_row( @@ -114,7 +115,7 @@ fn generate_review_report(options: &ReviewReportOptions) -> Result<(), String> { .collect::, _>>()?; observations.extend(input_observations.clone()); - let input_analyses = run_review_analyses(options, &case, &input_bytes)?; + let input_analyses = run_review_analyses(options, &case, &input_bytes, &manifest_rows)?; analyses.extend(input_analyses.clone()); let synthetic_input = PathBuf::from(format!("review://{}", case.id)); let synthetic_input_name = synthetic_input @@ -206,6 +207,7 @@ fn run_review_analyses( options: &ReviewReportOptions, case: &ReviewCase, input_bytes: &str, + observation_rows: &[BTreeMap], ) -> Result, String> { let temp_dir = options.output_dir.join(".review-temp"); fs::create_dir_all(&temp_dir).map_err(|err| { @@ -221,14 +223,13 @@ fn run_review_analyses( format: Some(GenotypeSourceFormat::Text), ..GenotypeLoadOptions::default() }; - let observation_rows = Vec::new(); let analysis_options = ReportAnalysisOptions { runtime_root: &options.root, input_file: &temp_path, participant_id: &case.id, loader: &loader, output_dir: &options.output_dir, - observation_rows: &observation_rows, + observation_rows, filters: &options.filters, max_duration_ms: 1_000, }; diff --git a/rust/bioscript-reporting/src/manifest.rs b/rust/bioscript-reporting/src/manifest.rs index 87baadc..06e0645 100644 --- a/rust/bioscript-reporting/src/manifest.rs +++ b/rust/bioscript-reporting/src/manifest.rs @@ -8,6 +8,7 @@ use bioscript_schema::{ load_variant_manifest_text, }; +#[path = "manifest_provenance.rs"] mod provenance; pub use provenance::{collect_manifest_provenance_entries, load_manifest_provenance_links}; diff --git a/rust/bioscript-reporting/src/manifest_provenance.rs b/rust/bioscript-reporting/src/manifest_provenance.rs new file mode 100644 index 0000000..02d81c1 --- /dev/null +++ b/rust/bioscript-reporting/src/manifest_provenance.rs @@ -0,0 +1,75 @@ +use std::collections::BTreeMap; + +use super::{ + ManifestWorkspace, manifest_supports_findings, traversable_manifest_member_paths, yaml_string, + yaml_to_json, +}; + +pub fn load_manifest_provenance_links( + workspace: &impl ManifestWorkspace, + path: &str, +) -> Result, String> { + let value = workspace.load_yaml(path)?; + let schema = yaml_string(&value, "schema").unwrap_or_default(); + let mut links = BTreeMap::new(); + + collect_manifest_provenance_entries(&value, &mut links)?; + + if manifest_supports_findings(&schema) + && let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + if let Some(include) = item.get("include").and_then(serde_yaml::Value::as_str) { + let include_path = workspace.resolve(path, include)?; + let included = load_manifest_provenance_links(workspace, &include_path)?; + for source in included { + insert_provenance_link(&mut links, source); + } + } + } + } + + for member_path in traversable_manifest_member_paths(&schema, &value) { + let resolved = workspace.resolve(path, member_path)?; + let member_links = load_manifest_provenance_links(workspace, &resolved)?; + for source in member_links { + insert_provenance_link(&mut links, source); + } + } + + Ok(links.into_values().collect()) +} + +pub fn collect_manifest_provenance_entries( + value: &serde_yaml::Value, + links: &mut BTreeMap, +) -> Result<(), String> { + let Some(sources) = value + .get("provenance") + .and_then(|provenance| provenance.get("sources")) + .and_then(serde_yaml::Value::as_sequence) + else { + return Ok(()); + }; + + for source in sources { + let source = yaml_to_json(source.clone())?; + insert_provenance_link(links, source); + } + Ok(()) +} + +fn insert_provenance_link( + links: &mut BTreeMap, + source: serde_json::Value, +) { + let key = source + .get("url") + .or_else(|| source.get("url_template")) + .or_else(|| source.get("label")) + .and_then(serde_json::Value::as_str) + .map_or_else(|| source.to_string(), ToOwned::to_owned); + links.entry(key).or_insert(source); +} diff --git a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index a5e0e11..69f98a1 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -134,7 +134,16 @@ impl BioscriptRuntime { "__file__", MontyObject::String(script_path.display().to_string()), )); - extra_inputs.push(("bioscript", bioscript_object())); + extra_inputs.push(( + "bioscript", + bioscript_object(MontyObject::Dict( + self.config + .context + .iter() + .map(|(key, value)| (MontyObject::String(key.clone()), value.clone())) + .collect(), + )), + )); let result = self.run_script( &instrumented, @@ -247,8 +256,10 @@ impl BioscriptRuntime { ("Bioscript", "variant") => self.method_variant(args, kwargs), ("Bioscript", "query_plan") => self.method_query_plan(args, kwargs), ("Bioscript", "write_tsv") => self.method_write_tsv(args, kwargs), + ("Bioscript", "read_tsv") => self.method_read_tsv(args, kwargs), ("Bioscript", "read_text") => self.method_read_text(args, kwargs), ("Bioscript", "write_text") => self.method_write_text(args, kwargs), + ("Bioscript", "exists") => self.method_exists(args, kwargs), ("GenotypeFile", "get") => self.method_genotype_get(args, kwargs), ("GenotypeFile", "lookup_variant") => self.method_genotype_lookup_variant(args, kwargs), ("GenotypeFile", "lookup_variant_details") => { @@ -281,6 +292,19 @@ impl BioscriptRuntime { fn resolve_user_path(&self, raw_path: &str) -> Result { let path = Path::new(raw_path); if path.is_absolute() { + if self.uses_virtual_files() { + for component in path.components() { + match component { + Component::ParentDir | Component::Prefix(_) => { + return Err(RuntimeError::InvalidArguments(format!( + "path escapes bioscript root: {raw_path}" + ))); + } + Component::RootDir | Component::CurDir | Component::Normal(_) => {} + } + } + return Ok(path.to_path_buf()); + } return Err(RuntimeError::InvalidArguments(format!( "absolute paths are not allowed: {raw_path}" ))); @@ -303,6 +327,11 @@ impl BioscriptRuntime { if self.virtual_file_exists(raw_path) { return Ok(path); } + if self.uses_virtual_files() && Path::new(raw_path).is_absolute() { + return Err(RuntimeError::Io(format!( + "virtual file does not exist: {raw_path}" + ))); + } let canonical = path.canonicalize().map_err(|err| { RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) })?; @@ -313,6 +342,13 @@ impl BioscriptRuntime { fn resolve_user_write_path(&self, raw_path: &str) -> Result { let path = self.resolve_user_path(raw_path)?; if self.uses_virtual_files() { + if Path::new(raw_path).is_absolute() + && !(raw_path.starts_with("/output/") || raw_path.starts_with("/work/")) + { + return Err(RuntimeError::InvalidArguments(format!( + "virtual write path must be under /work or /output: {raw_path}" + ))); + } return Ok(path); } if path.exists() { @@ -773,7 +809,7 @@ mod tests { ) .unwrap(); let runtime = BioscriptRuntime::new(&root).unwrap(); - let bioscript = bioscript_object(); + let bioscript = bioscript_object(MontyObject::Dict(Vec::new().into())); let genotype = runtime .method_load_genotypes( @@ -853,7 +889,7 @@ mod tests { fn runtime_private_methods_cover_successful_text_tsv_and_trace_paths() { let root = temp_dir("host-output"); let runtime = BioscriptRuntime::new(&root).unwrap(); - let bioscript = bioscript_object(); + let bioscript = bioscript_object(MontyObject::Dict(Vec::new().into())); let rows = MontyObject::List(vec![MontyObject::Dict( vec![ ( diff --git a/rust/bioscript-runtime/src/runtime/methods.rs b/rust/bioscript-runtime/src/runtime/methods.rs index d9490d3..1a48a7a 100644 --- a/rust/bioscript-runtime/src/runtime/methods.rs +++ b/rust/bioscript-runtime/src/runtime/methods.rs @@ -358,6 +358,60 @@ impl BioscriptRuntime { host_read_text(self, &args[1..], kwargs) } + pub(super) fn method_read_tsv( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + reject_kwargs(kwargs, "bioscript.read_tsv")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "bioscript.read_tsv expects self and path".to_owned(), + )); + } + let path = + self.resolve_existing_user_path(&expect_string_arg(args, 1, "bioscript.read_tsv")?)?; + let text = if let Some(content) = self.read_virtual_text_file(&path) { + content + } else { + fs::read_to_string(&path).map_err(|err| { + RuntimeError::Io(format!("failed to read {}: {err}", path.display())) + })? + }; + Ok(tsv_rows_object(&text)) + } + + pub(super) fn method_exists( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + reject_kwargs(kwargs, "bioscript.exists")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "bioscript.exists expects self and path".to_owned(), + )); + } + let raw_path = expect_string_arg(args, 1, "bioscript.exists")?; + if self.config.virtual_text_files.contains_key(&raw_path) + || self.config.virtual_binary_files.contains_key(&raw_path) + || self + .state + .virtual_written_text_files + .lock() + .expect("virtual file mutex poisoned") + .contains_key(&raw_path) + { + return Ok(MontyObject::Bool(true)); + } + if self.config.virtual_text_files.is_empty() && self.config.virtual_binary_files.is_empty() + { + let path = self.resolve_user_path(&raw_path)?; + return Ok(MontyObject::Bool(path.exists())); + } + Ok(MontyObject::Bool(false)) + } + pub(super) fn method_write_text( &self, args: &[MontyObject], @@ -371,3 +425,32 @@ impl BioscriptRuntime { host_write_text(self, &args[1..], kwargs) } } + +fn tsv_rows_object(text: &str) -> MontyObject { + let mut lines = text.lines().filter(|line| !line.trim().is_empty()); + let Some(header_line) = lines.next() else { + return MontyObject::List(Vec::new()); + }; + let headers: Vec<&str> = header_line.split('\t').collect(); + MontyObject::List( + lines + .map(|line| { + let cells: Vec<&str> = line.split('\t').collect(); + MontyObject::Dict( + headers + .iter() + .enumerate() + .map(|(idx, header)| { + ( + MontyObject::String((*header).to_owned()), + MontyObject::String( + cells.get(idx).copied().unwrap_or_default().to_owned(), + ), + ) + }) + .collect(), + ) + }) + .collect(), + ) +} diff --git a/rust/bioscript-runtime/src/runtime/objects.rs b/rust/bioscript-runtime/src/runtime/objects.rs index 818ccdb..ae30b06 100644 --- a/rust/bioscript-runtime/src/runtime/objects.rs +++ b/rust/bioscript-runtime/src/runtime/objects.rs @@ -1,12 +1,12 @@ use bioscript_core::{VariantKind, VariantSpec}; use monty::MontyObject; -pub(crate) fn bioscript_object() -> MontyObject { +pub(crate) fn bioscript_object(context: MontyObject) -> MontyObject { MontyObject::Dataclass { name: "Bioscript".to_owned(), type_id: 1, - field_names: vec![], - attrs: vec![].into(), + field_names: vec!["context".to_owned()], + attrs: vec![(MontyObject::String("context".to_owned()), context)].into(), frozen: true, } } diff --git a/rust/bioscript-runtime/src/runtime/state.rs b/rust/bioscript-runtime/src/runtime/state.rs index 94adc87..222ded0 100644 --- a/rust/bioscript-runtime/src/runtime/state.rs +++ b/rust/bioscript-runtime/src/runtime/state.rs @@ -17,6 +17,7 @@ use bioscript_core::RuntimeError; pub struct RuntimeConfig { pub limits: ResourceLimits, pub loader: GenotypeLoadOptions, + pub context: BTreeMap, pub virtual_binary_files: BTreeMap>, pub virtual_text_files: BTreeMap, /// Observations the host has already resolved before invoking the @@ -38,6 +39,7 @@ impl Default for RuntimeConfig { Self { limits, loader: GenotypeLoadOptions::default(), + context: BTreeMap::new(), virtual_binary_files: BTreeMap::new(), virtual_text_files: BTreeMap::new(), preloaded_observations: Vec::new(), diff --git a/rust/bioscript-schema/src/lib.rs b/rust/bioscript-schema/src/lib.rs index b2d70c1..2cb05da 100644 --- a/rust/bioscript-schema/src/lib.rs +++ b/rust/bioscript-schema/src/lib.rs @@ -5,10 +5,10 @@ pub use remote_resource::{ RemoteDependency, RemoteResourceKind, RemoteResourceResolution, resolve_remote_resource_text, }; pub use validator::{ - AssayManifest, Download, FileReport, Issue, PanelInterpretation, PanelInterpretationLogic, - PanelInterpretationLogicSource, PanelManifest, PanelMember, Permissions, Severity, - ValidationReport, VariantManifest, load_assay_manifest, load_assay_manifest_text, - load_panel_manifest, load_panel_manifest_text, load_variant_manifest, + AssayManifest, Download, FileReport, Issue, PanelInterpretation, PanelInterpretationAsset, + PanelInterpretationLogic, PanelInterpretationLogicSource, PanelManifest, PanelMember, + Permissions, Severity, ValidationReport, VariantManifest, load_assay_manifest, + load_assay_manifest_text, load_panel_manifest, load_panel_manifest_text, load_variant_manifest, load_variant_manifest_text, load_variant_manifest_text_for_lookup, validate_assays_path, validate_panels_path, validate_variants_path, }; diff --git a/rust/bioscript-schema/src/remote_resource.rs b/rust/bioscript-schema/src/remote_resource.rs index 40ad8d1..33d0b1b 100644 --- a/rust/bioscript-schema/src/remote_resource.rs +++ b/rust/bioscript-schema/src/remote_resource.rs @@ -104,15 +104,15 @@ fn infer_kind(name: &str, schema: Option<&str>, parsed: Option<&Value>) -> Remot } if let Some(schema) = schema { let lower = schema.to_ascii_lowercase(); + if lower.contains("catalogue") || lower.contains("catalog") || lower.contains("index") { + return RemoteResourceKind::Catalogue; + } if lower.contains("variant") { return RemoteResourceKind::Variant; } if lower.contains("panel") { return RemoteResourceKind::Panel; } - if lower.contains("catalogue") || lower.contains("catalog") || lower.contains("index") { - return RemoteResourceKind::Catalogue; - } if lower.contains("assay") { return RemoteResourceKind::Assay; } diff --git a/rust/bioscript-schema/src/validator.rs b/rust/bioscript-schema/src/validator.rs index 0bae7f6..278685f 100644 --- a/rust/bioscript-schema/src/validator.rs +++ b/rust/bioscript-schema/src/validator.rs @@ -5,6 +5,7 @@ include!("validator_types.rs"); include!("validator_load.rs"); include!("validator_roots.rs"); include!("validator_alleles_findings.rs"); +include!("validator_catalogue.rs"); include!("validator_panel.rs"); include!("validator_parse.rs"); include!("validator_helpers.rs"); diff --git a/rust/bioscript-schema/src/validator_catalogue.rs b/rust/bioscript-schema/src/validator_catalogue.rs new file mode 100644 index 0000000..b348ad9 --- /dev/null +++ b/rust/bioscript-schema/src/validator_catalogue.rs @@ -0,0 +1,176 @@ +fn validate_variant_catalogue_root(root: &Value, issues: &mut Vec) { + require_const(root, &["schema"], "bioscript:variant-catalogue:1.0", issues); + require_const(root, &["version"], "1.0", issues); + validate_optional_strings(root, &["name", "label", "summary"], issues); + require_path(root, &["name"], issues); + validate_tags(root, issues); + validate_catalogue_table(root, "variants", true, issues); + validate_catalogue_table(root, "findings", false, issues); + validate_catalogue_provenance(root, issues); +} + +fn validate_catalogue_table( + root: &Value, + field: &str, + required: bool, + issues: &mut Vec, +) { + let Some(value) = value_at(root, &[field]) else { + if required { + issues.push(Issue { + severity: Severity::Error, + path: field.to_owned(), + message: "missing required table declaration".to_owned(), + }); + } + return; + }; + let Some(mapping) = value.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: field.to_owned(), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_required_non_empty_mapping_string(mapping, field, "source", issues); + validate_optional_catalogue_mapping_string(mapping, field, "format", issues); + validate_optional_catalogue_mapping_string(mapping, field, "key", issues); + + if let Some(source) = mapping + .get(Value::String("source".to_owned())) + .and_then(Value::as_str) + && !std::path::Path::new(source) + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("tsv")) + { + issues.push(Issue { + severity: Severity::Warning, + path: format!("{field}.source"), + message: "expected a .tsv source for auditable tabular catalogue data".to_owned(), + }); + } + if let Some(format) = mapping + .get(Value::String("format".to_owned())) + .and_then(Value::as_str) + && format != "tsv" + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{field}.format"), + message: "expected 'tsv'".to_owned(), + }); + } +} + +fn validate_catalogue_provenance(root: &Value, issues: &mut Vec) { + let Some(sources) = value_at(root, &["provenance", "sources"]).and_then(Value::as_sequence) + else { + return; + }; + let mut ids = BTreeSet::new(); + for (idx, source) in sources.iter().enumerate() { + let Some(mapping) = source.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["id", "kind", "label"] { + validate_required_non_empty_mapping_string( + mapping, + &format!("provenance.sources[{idx}]"), + field, + issues, + ); + } + if let Some(id) = mapping + .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + && !ids.insert(id.to_owned()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].id"), + message: format!("duplicate provenance source id '{id}'"), + }); + } + + let url = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str); + let url_template = mapping + .get(Value::String("url_template".to_owned())) + .and_then(Value::as_str); + match (url, url_template) { + (Some(value), _) => validate_url_string( + value, + &format!("provenance.sources[{idx}].url"), + false, + issues, + ), + (None, Some(value)) => validate_url_template( + value, + &format!("provenance.sources[{idx}].url_template"), + issues, + ), + (None, None) => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}]"), + message: "expected url or url_template".to_owned(), + }), + } + } +} + +fn validate_url_template(value: &str, path: &str, issues: &mut Vec) { + let sample = value + .replace("{rsid}", "rs1") + .replace("{genomic_hgvs}", "NG_000001.1%3Ag.1A%3ET") + .replace("{variant_id}", "rs1"); + validate_url_string(&sample, path, false, issues); +} + +fn validate_required_non_empty_mapping_string( + mapping: &Mapping, + parent: &str, + field: &str, + issues: &mut Vec, +) { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "missing required field".to_owned(), + }), + } +} + +fn validate_optional_catalogue_mapping_string( + mapping: &Mapping, + parent: &str, + field: &str, + issues: &mut Vec, +) { + let Some(value) = mapping.get(Value::String(field.to_owned())) else { + return; + }; + if !matches!(value, Value::String(text) if !text.trim().is_empty()) { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "expected non-empty string".to_owned(), + }); + } +} diff --git a/rust/bioscript-schema/src/validator_load.rs b/rust/bioscript-schema/src/validator_load.rs index 776314f..699e9ee 100644 --- a/rust/bioscript-schema/src/validator_load.rs +++ b/rust/bioscript-schema/src/validator_load.rs @@ -329,6 +329,14 @@ fn validate_variant_file(path: &Path) -> Result { }], }); }; + if schema == "bioscript:variant-catalogue:1.0" { + let mut issues = Vec::new(); + validate_variant_catalogue_root(&value, &mut issues); + return Ok(FileReport { + file: path.to_path_buf(), + issues, + }); + } if !schema.contains("variant") { if schema == "bioscript:pgx-findings:1.0" { let mut issues = Vec::new(); diff --git a/rust/bioscript-schema/src/validator_panel.rs b/rust/bioscript-schema/src/validator_panel.rs index ba6f9e5..4c23b12 100644 --- a/rust/bioscript-schema/src/validator_panel.rs +++ b/rust/bioscript-schema/src/validator_panel.rs @@ -250,9 +250,54 @@ fn validate_panel_interpretation( } } validate_panel_interpretation_logic(key, idx, mapping, issues); + validate_panel_interpretation_assets(key, idx, mapping, issues); validate_panel_interpretation_emits(key, idx, mapping, issues); } +fn validate_panel_interpretation_assets( + key: &str, + idx: usize, + mapping: &Mapping, + issues: &mut Vec, +) { + let Some(assets) = mapping + .get(Value::String("assets".to_owned())) + .and_then(Value::as_sequence) + else { + return; + }; + let mut ids = BTreeSet::new(); + for (asset_idx, asset) in assets.iter().enumerate() { + let Some(asset) = asset.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].assets[{asset_idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["id", "path"] { + validate_required_mapping_string( + asset, + field, + &format!("{key}[{idx}].assets[{asset_idx}]"), + issues, + ); + } + if let Some(id) = asset + .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + && !ids.insert(id.to_owned()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].assets[{asset_idx}].id"), + message: format!("duplicate asset id '{id}'"), + }); + } + } +} + fn validate_panel_interpretation_logic( key: &str, idx: usize, diff --git a/rust/bioscript-schema/src/validator_parse.rs b/rust/bioscript-schema/src/validator_parse.rs index b959518..f7f4f16 100644 --- a/rust/bioscript-schema/src/validator_parse.rs +++ b/rust/bioscript-schema/src/validator_parse.rs @@ -82,6 +82,7 @@ fn parse_panel_interpretations(root: &Value) -> Result, .and_then(Value::as_str) .map(ToOwned::to_owned), derived_from: mapping_sequence_of_strings(mapping, "derived_from", idx, key)?, + assets: parse_panel_interpretation_assets(mapping, idx, key)?, emits: parse_panel_interpretation_emits(mapping, idx)?, logic: parse_panel_interpretation_logic(mapping)?, }); @@ -89,6 +90,42 @@ fn parse_panel_interpretations(root: &Value) -> Result, Ok(interpretations) } +fn parse_panel_interpretation_assets( + mapping: &Mapping, + interpretation_idx: usize, + key: &str, +) -> Result, String> { + let Some(items) = mapping + .get(Value::String("assets".to_owned())) + .and_then(Value::as_sequence) + else { + return Ok(Vec::new()); + }; + let mut assets = Vec::new(); + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!( + "{key}[{interpretation_idx}].assets[{idx}] must be a mapping" + )); + }; + assets.push(PanelInterpretationAsset { + id: mapping_required_string( + mapping, + "id", + idx, + &format!("{key}[{interpretation_idx}].assets"), + )?, + path: mapping_required_string( + mapping, + "path", + idx, + &format!("{key}[{interpretation_idx}].assets"), + )?, + }); + } + Ok(assets) +} + fn parse_panel_interpretation_logic( mapping: &Mapping, ) -> Result, String> { diff --git a/rust/bioscript-schema/src/validator_roots.rs b/rust/bioscript-schema/src/validator_roots.rs index 9600eb7..c9d6f73 100644 --- a/rust/bioscript-schema/src/validator_roots.rs +++ b/rust/bioscript-schema/src/validator_roots.rs @@ -34,7 +34,7 @@ fn validate_panel_root(root: &Value, issues: &mut Vec) { validate_tags(root, issues); validate_permissions(root, issues); validate_downloads(root, issues); - validate_panel_members(root, &["variant", "assay"], issues); + validate_panel_members(root, &["variant", "variant-catalogue", "assay"], issues); validate_panel_interpretations(root, issues); validate_findings(root, issues); } @@ -43,7 +43,7 @@ fn validate_assay_root(root: &Value, issues: &mut Vec) { validate_schema_and_identity(root, "bioscript:assay:1.0", None, issues); validate_optional_strings(root, &["name", "label", "summary"], issues); validate_tags(root, issues); - validate_panel_members(root, &["variant"], issues); + validate_panel_members(root, &["variant", "variant-catalogue"], issues); validate_panel_interpretations(root, issues); validate_findings(root, issues); } @@ -331,4 +331,3 @@ fn validate_coordinate_range_values(start: i64, end: i64, assembly: &str, issues }); } } - diff --git a/rust/bioscript-schema/src/validator_types.rs b/rust/bioscript-schema/src/validator_types.rs index a9466eb..a96a5a8 100644 --- a/rust/bioscript-schema/src/validator_types.rs +++ b/rust/bioscript-schema/src/validator_types.rs @@ -157,10 +157,17 @@ pub struct PanelInterpretation { pub path: String, pub output_format: Option, pub derived_from: Vec, + pub assets: Vec, pub emits: Vec, pub logic: Option, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationAsset { + pub id: String, + pub path: String, +} + #[derive(Debug, Clone, PartialEq, Eq)] pub struct PanelInterpretationLogic { pub source: Option, diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index 82a0fa0..3733cda 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -106,6 +106,73 @@ provenance: assert_eq!(report.total_warnings(), 0); } +#[test] +fn validate_variants_accepts_variant_catalogue_shape() { + let dir = temp_dir("validate-catalogue"); + let fixture = dir.join("variants.yaml"); + fs::write( + &fixture, + r#" +schema: "bioscript:variant-catalogue:1.0" +version: "1.0" +name: "thalassemia-variants" +variants: + source: "variants.tsv" + format: "tsv" + key: "variant_id" +findings: + source: "findings.tsv" + format: "tsv" + key: "variant_id" +provenance: + sources: + - id: "ithagenes" + kind: "database" + label: "IthaGenes" + url: "https://www.ithanet.eu/db/ithagenes?action=list" + - id: "dbsnp" + kind: "database" + label: "dbSNP" + url_template: "https://www.ncbi.nlm.nih.gov/snp/{rsid}" +"#, + ) + .unwrap(); + + let report = validate_variants_path(&fixture).unwrap(); + assert_eq!(report.total_errors(), 0); + assert_eq!(report.total_warnings(), 0); +} + +#[test] +fn validate_variants_reports_variant_catalogue_shape_issues() { + let dir = temp_dir("validate-catalogue-errors"); + let fixture = dir.join("variants.yaml"); + fs::write( + &fixture, + r#" +schema: "bioscript:variant-catalogue:1.0" +version: "1.0" +name: "broken-catalogue" +variants: + source: "variants.csv" + format: "csv" +provenance: + sources: + - id: "dbsnp" + kind: "database" + label: "dbSNP" +"#, + ) + .unwrap(); + + let report = validate_variants_path(&fixture).unwrap(); + let text = report.render_text(); + assert_eq!(report.total_errors(), 2); + assert_eq!(report.total_warnings(), 1); + assert!(text.contains("expected 'tsv'")); + assert!(text.contains("expected url or url_template")); +} + #[test] fn load_variant_manifest_text_accepts_start_end_coordinates() { let manifest = load_variant_manifest_text( @@ -260,6 +327,11 @@ interpretations: path: "apol1.py" derived_from: - "g1-site-1.yaml" + assets: + - id: "variants" + path: "variants.tsv" + - id: "findings" + path: "findings.tsv" emits: - key: "apol1_status" label: "APOL1 status" diff --git a/rust/bioscript-wasm/src/report_workspace/analysis.rs b/rust/bioscript-wasm/src/report_workspace/analysis.rs index c67ae08..e63f248 100644 --- a/rust/bioscript-wasm/src/report_workspace/analysis.rs +++ b/rust/bioscript-wasm/src/report_workspace/analysis.rs @@ -96,6 +96,7 @@ impl PackageWorkspace { RuntimeConfig { limits, loader: loader.clone(), + context: BTreeMap::new(), virtual_binary_files, virtual_text_files: std::mem::take(&mut virtual_text_files), preloaded_observations: preloaded_observations.to_vec(), From 1ba3f8768c8b7e2fe9915e639e63daebbb704401 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 14 May 2026 15:29:48 +1000 Subject: [PATCH 2/5] Adding initial variant catalogue plus controlled file i/o --- docs/assay-schema.md | 17 +- docs/variant-catalogue.md | 17 +- rust/bioscript-cli/src/manifest_runner.rs | 65 ++++ rust/bioscript-cli/src/report_execution.rs | 188 +++++++++-- rust/bioscript-cli/src/report_observations.rs | 19 +- rust/bioscript-cli/tests/cli.rs | 156 ++++++++- rust/bioscript-reporting/src/lib.rs | 7 +- rust/bioscript-reporting/src/manifest.rs | 121 +++---- .../src/manifest_catalogue.rs | 302 ++++++++++++++++++ .../src/manifest_members.rs | 91 ++++++ .../tests/validate_variants.rs | 3 + rust/bioscript-wasm/src/report_api.rs | 2 +- rust/bioscript-wasm/src/report_workspace.rs | 25 +- .../src/report_workspace/analysis.rs | 132 +++++++- 14 files changed, 982 insertions(+), 163 deletions(-) create mode 100644 rust/bioscript-reporting/src/manifest_catalogue.rs create mode 100644 rust/bioscript-reporting/src/manifest_members.rs diff --git a/docs/assay-schema.md b/docs/assay-schema.md index 3deec23..57dca64 100644 --- a/docs/assay-schema.md +++ b/docs/assay-schema.md @@ -88,19 +88,24 @@ Rules: - `kind` is currently `bioscript` - `path` points to a BioScript-compatible Python file - `output_format` is optional and defaults to `tsv`; use `json` or `jsonl` when the script writes structured JSON output -- `derived_from` lists the variant YAML files used by the interpretation -- `assets` is optional and lists local files the analysis script can read through the injected `asset_paths` dict +- `derived_from` lists the variant YAML or variant catalogue files used by the interpretation +- `assets` is optional and lists local files the analysis script can read through `bioscript.context["assets"]`; `asset_paths` is also injected during migration - `emits` is optional but recommended so report generators know which output columns to display and how to label them - `logic` is optional; use `logic.description` and `logic.source.url` to document where the script's derivation rules came from - Analysis rows may emit `notes` or `report_notes` as a reporting convention. HTML reports render those notes below the analysis table and omit them from the table columns; this avoids a manifest-level template language while still letting the script build human-readable text from computed values. Analysis scripts receive these injected variables: -- `input_file`: input genotype/VCF/CRAM path -- `output_file`: path the script must write +- `input_file`: virtual input genotype/VCF/CRAM path, normally `/input/genotypes` +- `output_file`: virtual output path the script must write, normally under `/output` - `participant_id`: current participant/sample identifier -- `observations_file`: TSV containing the variant observations already gathered from assay/panel members for this participant -- `asset_paths`: dict from each `assets[].id` to the resolved local path +- `observations_file`: virtual TSV path containing the variant observations already gathered from assay/panel members for this participant +- `asset_paths`: dict from each `assets[].id` to its virtual path + +The preferred API is `bioscript.context`, which includes `participant_id`, +`input_files`, `pipeline_files`, `assets`, `observations_file`, and +`output_file`. Scripts should use `bioscript.read_tsv(path)` for TSV assets and +observation files. This supports large catalogues where Rust/BioScript first gathers all variant observations, then Python joins those observations to attached TSV assets such as `findings.tsv`, `conditions.tsv`, or `rules.tsv` and emits derived classification rows. diff --git a/docs/variant-catalogue.md b/docs/variant-catalogue.md index ec133fd..e250e3c 100644 --- a/docs/variant-catalogue.md +++ b/docs/variant-catalogue.md @@ -6,9 +6,9 @@ curated source stays as plain text TSV plus a small YAML manifest. Build tools can still expand the catalogue into normal `bioscript:variant:1.0` YAML files when a package or older runtime path needs per-variant manifests. -This is a design note for the intended BioScript flow. The catalogue manifest -schema already exists as `bioscript:variant-catalogue:1.0`; direct assay member -support for catalogues is the next BioScript schema/runtime change. +This is the BioScript flow for TSV-backed variant catalogues. The catalogue +manifest schema is `bioscript:variant-catalogue:1.0`, and assay/panel members +can point at catalogue manifests directly. ## Why This Exists @@ -50,11 +50,9 @@ members: version: "1.0" ``` -Today, `bioscript:assay:1.0` assay members accept `kind: "variant"` only, so the -direct member above is the required next schema change. The member should point -to the catalogue YAML manifest, not directly to `variants.tsv`, because the YAML -manifest holds schema identity, version, table locations, keys, and shared -provenance. +The member should point to the catalogue YAML manifest, not directly to +`variants.tsv`, because the YAML manifest holds schema identity, version, table +locations, keys, and shared provenance. The catalogue manifest shape is: @@ -370,9 +368,6 @@ files work, but the real thalassemia logic needs proper allele, phase, gene, and evidence handling. ```python -import bioscript - - observations = bioscript.read_tsv(bioscript.context["observations_file"]) variants = bioscript.read_tsv(bioscript.context["assets"]["variants"]) findings = bioscript.read_tsv(bioscript.context["assets"]["findings"]) diff --git a/rust/bioscript-cli/src/manifest_runner.rs b/rust/bioscript-cli/src/manifest_runner.rs index eddac35..9786e29 100644 --- a/rust/bioscript-cli/src/manifest_runner.rs +++ b/rust/bioscript-cli/src/manifest_runner.rs @@ -43,6 +43,43 @@ fn run_manifest( )?; Ok(()) } + bioscript_reporting::ReportManifestKind::VariantCatalogue => { + let input_file = resolved_input + .as_deref() + .ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), options.loader) + .map_err(|err| err.to_string())?; + let workspace = bioscript_reporting::FilesystemManifestWorkspace::new(runtime_root); + let tasks = bioscript_reporting::collect_variant_manifest_tasks( + &workspace, + &manifest_path.display().to_string(), + options.filters, + )?; + let observations = store + .lookup_variants( + &tasks + .iter() + .map(|task| task.manifest.spec.clone()) + .collect::>(), + ) + .map_err(|err| err.to_string())?; + let rows = tasks + .into_iter() + .zip(observations) + .map(|(task, observation)| { + variant_row( + runtime_root, + Path::new(&task.manifest_path), + &task.manifest.name, + &task.manifest.tags, + &observation, + options.participant_id, + ) + }) + .collect::>(); + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } bioscript_reporting::ReportManifestKind::Panel => { let manifest = load_panel_manifest(manifest_path)?; let rows = run_panel_manifest( @@ -138,6 +175,22 @@ fn run_panel_manifest_with_store( } variant_entries.push((member_index, resolved, manifest)); } + bioscript_reporting::ExecutablePanelMember::VariantCatalogue(path) => { + let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + let workspace = bioscript_reporting::FilesystemManifestWorkspace::new(runtime_root); + let tasks = bioscript_reporting::collect_variant_manifest_tasks( + &workspace, + &resolved.display().to_string(), + filters, + )?; + for task in tasks { + variant_entries.push(( + member_index, + PathBuf::from(&task.manifest_path), + task.manifest, + )); + } + } bioscript_reporting::ExecutablePanelMember::Assay(path) => { let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; let assay = load_assay_manifest(&resolved)?; @@ -211,6 +264,18 @@ fn run_assay_manifest_with_store( } entries.push((resolved, manifest)); } + bioscript_reporting::ExecutableAssayMember::VariantCatalogue(path) => { + let resolved = resolve_manifest_path(runtime_root, &assay.path, path)?; + let workspace = bioscript_reporting::FilesystemManifestWorkspace::new(runtime_root); + let tasks = bioscript_reporting::collect_variant_manifest_tasks( + &workspace, + &resolved.display().to_string(), + filters, + )?; + for task in tasks { + entries.push((PathBuf::from(&task.manifest_path), task.manifest)); + } + } } } diff --git a/rust/bioscript-cli/src/report_execution.rs b/rust/bioscript-cli/src/report_execution.rs index 227b257..293b410 100644 --- a/rust/bioscript-cli/src/report_execution.rs +++ b/rust/bioscript-cli/src/report_execution.rs @@ -114,6 +114,8 @@ fn run_interpretations_for_report( })?; run_bioscript_analysis_script(&BioscriptAnalysisScriptInput { runtime_root: options.runtime_root, + manifest_path, + interpretation, script_path: &script_path, input_file: options.input_file, output_file: &output_file, @@ -163,6 +165,8 @@ fn run_interpretations_for_report( struct BioscriptAnalysisScriptInput<'a> { runtime_root: &'a Path, + manifest_path: &'a Path, + interpretation: &'a PanelInterpretation, script_path: &'a Path, input_file: &'a Path, output_file: &'a Path, @@ -179,56 +183,182 @@ fn run_bioscript_analysis_script(input: &BioscriptAnalysisScriptInput<'_>) -> Re .max_allocations(400_000) .gc_interval(1000) .max_recursion_depth(Some(200)); + let input_bytes = fs::read(input.input_file) + .map_err(|err| format!("failed to read analysis input {}: {err}", input.input_file.display()))?; + let observations_text = fs::read_to_string(input.observations_file).map_err(|err| { + format!( + "failed to read analysis observations {}: {err}", + input.observations_file.display() + ) + })?; + let script_text = fs::read_to_string(input.script_path) + .map_err(|err| format!("failed to read analysis script {}: {err}", input.script_path.display()))?; + let virtual_input_file = "/input/genotypes".to_owned(); + let virtual_observations_file = "/work/observations.tsv".to_owned(); + let output_extension = input + .output_file + .extension() + .and_then(|value| value.to_str()) + .unwrap_or("tsv"); + let virtual_output_file = format!("/output/results.{output_extension}"); + let script_virtual_path = virtual_pipeline_path(input.script_path, "analysis.py"); + let manifest_virtual_path = virtual_pipeline_path(input.manifest_path, "manifest.yaml"); + let mut virtual_text_files = BTreeMap::new(); + virtual_text_files.insert(script_virtual_path.clone(), script_text); + virtual_text_files.insert( + manifest_virtual_path.clone(), + fs::read_to_string(input.manifest_path).map_err(|err| { + format!( + "failed to read analysis manifest {}: {err}", + input.manifest_path.display() + ) + })?, + ); + virtual_text_files.insert(virtual_observations_file.clone(), observations_text); + let mut asset_paths = BTreeMap::new(); + for asset in &input.interpretation.assets { + let asset_path = resolve_manifest_path(input.runtime_root, input.manifest_path, &asset.path)?; + let virtual_asset_path = virtual_pipeline_path(&asset_path, &asset.path); + let text = fs::read_to_string(&asset_path) + .map_err(|err| format!("failed to read analysis asset {}: {err}", asset_path.display()))?; + virtual_text_files.insert(virtual_asset_path.clone(), text); + asset_paths.insert(asset.id.clone(), virtual_asset_path); + } + let mut virtual_binary_files = BTreeMap::new(); + virtual_binary_files.insert(virtual_input_file.clone(), input_bytes); + let context = analysis_context( + input.participant_id, + &virtual_input_file, + &script_virtual_path, + &manifest_virtual_path, + &asset_paths, + &virtual_observations_file, + &virtual_output_file, + ); let runtime = BioscriptRuntime::with_config( input.runtime_root.to_path_buf(), RuntimeConfig { limits, loader: input.loader.clone(), + context, + virtual_binary_files, + virtual_text_files, ..RuntimeConfig::default() }, ) .map_err(|err| err.to_string())?; runtime .run_file( - input.script_path, + &script_virtual_path, None, vec![ - ( - "input_file", - monty::MontyObject::String(runtime_path_string( - input.runtime_root, - input.input_file, - )), - ), - ( - "output_file", - monty::MontyObject::String(runtime_path_string( - input.runtime_root, - input.output_file, - )), - ), - ( - "observations_file", - monty::MontyObject::String(runtime_path_string( - input.runtime_root, - input.observations_file, - )), - ), + ("input_file", monty::MontyObject::String(virtual_input_file)), + ("output_file", monty::MontyObject::String(virtual_output_file.clone())), + ("observations_file", monty::MontyObject::String(virtual_observations_file)), + ("asset_paths", monty_string_dict(&asset_paths)), ( "participant_id", monty::MontyObject::String(input.participant_id.to_owned()), ), ], ) - .map(|_| ()) - .map_err(|err| err.to_string()) + .map_err(|err| err.to_string())?; + let written = runtime.virtual_written_text_files(); + let output_text = written + .get(&virtual_output_file) + .ok_or_else(|| format!("analysis did not write {virtual_output_file}"))?; + if let Some(parent) = input.output_file.parent() { + fs::create_dir_all(parent) + .map_err(|err| format!("failed to create analysis output dir {}: {err}", parent.display()))?; + } + fs::write(input.output_file, output_text).map_err(|err| { + format!( + "failed to write analysis output {}: {err}", + input.output_file.display() + ) + })?; + persist_virtual_output_files(&written, &virtual_output_file, input.output_file) } -fn runtime_path_string(runtime_root: &Path, path: &Path) -> String { - path.strip_prefix(runtime_root) - .unwrap_or(path) - .display() - .to_string() +fn persist_virtual_output_files( + written: &BTreeMap, + primary_virtual_output_file: &str, + primary_output_file: &Path, +) -> Result<(), String> { + let Some(output_dir) = primary_output_file.parent() else { + return Ok(()); + }; + for (virtual_path, text) in written { + if virtual_path == primary_virtual_output_file { + continue; + } + let Some(relative) = virtual_path.strip_prefix("/output/") else { + continue; + }; + let relative_path = Path::new(relative); + if relative_path + .components() + .any(|component| !matches!(component, std::path::Component::Normal(_))) + { + return Err(format!("analysis wrote invalid output path {virtual_path}")); + } + let output_path = output_dir.join(relative_path); + if let Some(parent) = output_path.parent() { + fs::create_dir_all(parent) + .map_err(|err| format!("failed to create output dir {}: {err}", parent.display()))?; + } + fs::write(&output_path, text) + .map_err(|err| format!("failed to write analysis output {}: {err}", output_path.display()))?; + } + Ok(()) +} + +fn virtual_pipeline_path(path: &Path, fallback: &str) -> String { + let name = path + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or(fallback); + format!("/input/pipeline/{name}") +} + +fn analysis_context( + participant_id: &str, + input_file: &str, + script_path: &str, + manifest_path: &str, + asset_paths: &BTreeMap, + observations_file: &str, + output_file: &str, +) -> BTreeMap { + BTreeMap::from([ + ( + "participant_id".to_owned(), + monty::MontyObject::String(participant_id.to_owned()), + ), + ( + "input_files".to_owned(), + monty_string_dict(&BTreeMap::from([( + "genotypes".to_owned(), + input_file.to_owned(), + )])), + ), + ( + "pipeline_files".to_owned(), + monty_string_dict(&BTreeMap::from([ + ("manifest".to_owned(), manifest_path.to_owned()), + ("analysis".to_owned(), script_path.to_owned()), + ])), + ), + ("assets".to_owned(), monty_string_dict(asset_paths)), + ( + "observations_file".to_owned(), + monty::MontyObject::String(observations_file.to_owned()), + ), + ( + "output_file".to_owned(), + monty::MontyObject::String(output_file.to_owned()), + ), + ]) } fn parse_analysis_output( diff --git a/rust/bioscript-cli/src/report_observations.rs b/rust/bioscript-cli/src/report_observations.rs index 261dada..eb11db3 100644 --- a/rust/bioscript-cli/src/report_observations.rs +++ b/rust/bioscript-cli/src/report_observations.rs @@ -11,10 +11,21 @@ fn app_observation_from_manifest_row( } else { runtime_root.join(&row_path) }; - let manifest = load_variant_manifest(&manifest_path)?; - let gene = variant_manifest_gene(&manifest_path)?; - let observed_alt_alleles = variant_observed_alt_alleles(&manifest_path)?; - let source = variant_primary_source(&manifest_path)?; + let workspace = bioscript_reporting::FilesystemManifestWorkspace::new(runtime_root); + let task = bioscript_reporting::load_variant_manifest_task_by_path( + &workspace, + &manifest_path.display().to_string(), + )?; + let manifest = task.manifest; + let (gene, observed_alt_alleles, source) = if row_path.contains('#') { + (String::new(), Vec::new(), serde_json::Value::Null) + } else { + ( + variant_manifest_gene(&manifest_path)?, + variant_observed_alt_alleles(&manifest_path)?, + variant_primary_source(&manifest_path)?, + ) + }; Ok(bioscript_reporting::app_observation_from_manifest_row( bioscript_reporting::AppObservationInput { row, diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index 17613ed..c2215c5 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -1,6 +1,6 @@ use std::{ fs, - path::PathBuf, + path::{Path, PathBuf}, process::Command, time::{SystemTime, UNIX_EPOCH}, }; @@ -379,3 +379,157 @@ interpretations: assert!(stdout.contains("example-rs73885319")); assert!(stdout.contains("AG")); } + +#[test] +fn report_runs_variant_catalogue_analysis_with_sandboxed_assets() { + let root = repo_root(); + let dir = temp_dir("catalogue-report"); + let (assay, output_dir) = write_catalogue_report_fixture(&dir); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("report") + .arg(&assay) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--output-dir") + .arg(&output_dir) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let participant = "test_snps"; + let analysis_tsv = + fs::read_to_string(output_dir.join(format!("analysis/{participant}/catalogue_status.tsv"))) + .unwrap(); + assert!(analysis_tsv.contains("catalogue_assets_loaded")); + let observations = fs::read_to_string(output_dir.join("observations.tsv")).unwrap(); + assert!(observations.contains("example-rs73885319")); + assert!(observations.contains("AG")); +} + +fn write_catalogue_report_fixture(dir: &Path) -> (PathBuf, PathBuf) { + let catalogue_dir = dir.join("catalogue"); + fs::create_dir_all(&catalogue_dir).unwrap(); + write_catalogue_manifest(&catalogue_dir); + write_catalogue_tables(&catalogue_dir); + write_catalogue_analysis(dir); + let assay = write_catalogue_assay(dir); + (assay, dir.join("report")) +} + +fn write_catalogue_manifest(catalogue_dir: &Path) { + fs::write( + catalogue_dir.join("variants.yaml"), + r#" +schema: "bioscript:variant-catalogue:1.0" +version: "1.0" +name: "apol1-catalogue" +variants: + source: "variants.tsv" + format: "tsv" + key: "variant_id" + columns: + variant_id: + role: "id" + required: true + name: + role: "name" + rsid: + role: "identifier.rsid" + kind: + role: "alleles.kind" + ref: + role: "alleles.ref" + alts: + role: "alleles.alts" + grch38_chrom: + role: "coordinates.grch38.chrom" + grch38_pos: + role: "coordinates.grch38.pos" + type: "integer" +provenance: + sources: + - id: "dbsnp" + kind: "database" + label: "dbSNP" + url_template: "https://www.ncbi.nlm.nih.gov/snp/{rsid}" +"#, + ) + .unwrap(); +} + +fn write_catalogue_tables(catalogue_dir: &Path) { + fs::write( + catalogue_dir.join("variants.tsv"), + "variant_id\tname\trsid\tkind\tref\talts\tgrch38_chrom\tgrch38_pos\napol1-g1-site-1\texample-rs73885319\trs73885319\tsnv\tA\tG\t22\t36265860\n", + ) + .unwrap(); + fs::write( + catalogue_dir.join("findings.tsv"), + "variant_id\tlabel\napol1-g1-site-1\tcatalogue_assets_loaded\n", + ) + .unwrap(); +} + +fn write_catalogue_analysis(dir: &Path) { + fs::write( + dir.join("analysis.py"), + r#" +observations = bioscript.read_tsv(bioscript.context["observations_file"]) +variants = bioscript.read_tsv(bioscript.context["assets"]["variants"]) +findings = bioscript.read_tsv(bioscript.context["assets"]["findings"]) + +status = "missing" +if len(observations) > 0 and len(variants) > 0 and len(findings) > 0: + status = findings[0]["label"] + +bioscript.write_tsv(bioscript.context["output_file"], [ + { + "participant_id": bioscript.context["participant_id"], + "status": status, + } +]) +"#, + ) + .unwrap(); +} + +fn write_catalogue_assay(dir: &Path) -> PathBuf { + let assay = dir.join("assay.yaml"); + fs::write( + &assay, + r#" +schema: "bioscript:assay:1.0" +version: "1.0" +name: "catalogue-assay" +members: + - kind: "variant-catalogue" + path: "catalogue/variants.yaml" + version: "1.0" +analyses: + - id: "catalogue_status" + kind: "bioscript" + path: "analysis.py" + output_format: "tsv" + derived_from: + - "catalogue/variants.yaml" + assets: + - id: "variants" + path: "catalogue/variants.tsv" + - id: "findings" + path: "catalogue/findings.tsv" + emits: + - key: "status" + label: "Status" + value_type: "string" + format: "badge" +"#, + ) + .unwrap(); + assay +} diff --git a/rust/bioscript-reporting/src/lib.rs b/rust/bioscript-reporting/src/lib.rs index 45d2f47..ea61414 100644 --- a/rust/bioscript-reporting/src/lib.rs +++ b/rust/bioscript-reporting/src/lib.rs @@ -26,9 +26,10 @@ pub use manifest::{ VariantManifestTask, assay_executable_member, assay_executable_member_path, collect_analysis_manifest_tasks, collect_manifest_provenance_entries, collect_variant_manifest_tasks, load_manifest_findings, load_manifest_provenance_links, - load_report_manifest_context, matches_analysis_path_filters, matches_variant_manifest_filters, - panel_executable_member, panel_executable_member_path, report_assay_id, report_manifest_kind, - report_manifest_metadata, report_manifest_schema, resolve_filesystem_manifest_path, + load_report_manifest_context, load_variant_manifest_task_by_path, + matches_analysis_path_filters, matches_variant_manifest_filters, panel_executable_member, + panel_executable_member_path, report_assay_id, report_manifest_kind, report_manifest_metadata, + report_manifest_schema, resolve_filesystem_manifest_path, }; pub use matching::match_app_findings; pub use observation::{AppObservationInput, app_observation_from_manifest_row}; diff --git a/rust/bioscript-reporting/src/manifest.rs b/rust/bioscript-reporting/src/manifest.rs index 06e0645..6d30287 100644 --- a/rust/bioscript-reporting/src/manifest.rs +++ b/rust/bioscript-reporting/src/manifest.rs @@ -8,14 +8,25 @@ use bioscript_schema::{ load_variant_manifest_text, }; +#[path = "manifest_catalogue.rs"] +mod catalogue; +#[path = "manifest_members.rs"] +mod members; #[path = "manifest_provenance.rs"] mod provenance; +use catalogue::load_variant_catalogue_tasks; +pub(crate) use members::traversable_manifest_member_paths; +pub use members::{ + ExecutableAssayMember, ExecutablePanelMember, assay_executable_member, + assay_executable_member_path, panel_executable_member, panel_executable_member_path, +}; pub use provenance::{collect_manifest_provenance_entries, load_manifest_provenance_links}; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ReportManifestKind { Variant, + VariantCatalogue, Panel, Assay, } @@ -23,6 +34,7 @@ pub enum ReportManifestKind { pub fn report_manifest_kind(schema: &str) -> Result { match schema { "bioscript:variant:1.0" | "bioscript:variant" => Ok(ReportManifestKind::Variant), + "bioscript:variant-catalogue:1.0" => Ok(ReportManifestKind::VariantCatalogue), "bioscript:panel:1.0" => Ok(ReportManifestKind::Panel), "bioscript:assay:1.0" => Ok(ReportManifestKind::Assay), other => Err(format!("unsupported manifest schema '{other}'")), @@ -220,7 +232,7 @@ pub fn collect_analysis_manifest_tasks( interpretations: manifest.interpretations, }]) } - ReportManifestKind::Variant => Ok(Vec::new()), + ReportManifestKind::Variant | ReportManifestKind::VariantCatalogue => Ok(Vec::new()), } } @@ -244,6 +256,9 @@ pub fn collect_variant_manifest_tasks( manifest, }]) } + ReportManifestKind::VariantCatalogue => { + load_variant_catalogue_tasks(workspace, path, filters) + } ReportManifestKind::Panel => { let text = workspace.load_text(path)?; let manifest = load_panel_manifest_text(path, &text)?; @@ -257,6 +272,10 @@ pub fn collect_variant_manifest_tasks( tasks.push(variant); } } + ExecutablePanelMember::VariantCatalogue(member_path) => { + let resolved = workspace.resolve(path, member_path)?; + tasks.extend(load_variant_catalogue_tasks(workspace, &resolved, filters)?); + } ExecutablePanelMember::Assay(member_path) => { let resolved = workspace.resolve(path, member_path)?; tasks.extend(collect_variant_manifest_tasks( @@ -280,6 +299,10 @@ pub fn collect_variant_manifest_tasks( tasks.push(variant); } } + ExecutableAssayMember::VariantCatalogue(member_path) => { + let resolved = workspace.resolve(path, member_path)?; + tasks.extend(load_variant_catalogue_tasks(workspace, &resolved, filters)?); + } } } Ok(tasks) @@ -299,6 +322,19 @@ fn load_variant_task( }) } +pub fn load_variant_manifest_task_by_path( + workspace: &impl ManifestWorkspace, + path: &str, +) -> Result { + if let Some((catalogue_path, variant_id)) = path.rsplit_once('#') { + return load_variant_catalogue_tasks(workspace, catalogue_path, &[])? + .into_iter() + .find(|task| task.manifest_path == path || task.manifest.name == variant_id) + .ok_or_else(|| format!("variant catalogue entry not found: {path}")); + } + load_variant_task(workspace, path) +} + pub fn load_manifest_findings( workspace: &impl ManifestWorkspace, path: &str, @@ -372,61 +408,6 @@ pub fn matches_analysis_path_filters(path: &str, filters: &[String]) -> bool { }) } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ExecutablePanelMember<'a> { - Variant(&'a str), - Assay(&'a str), -} - -pub fn panel_executable_member<'a>( - kind: &str, - path: Option<&'a str>, -) -> Result, String> { - let path = panel_executable_member_path(kind, path)?; - match kind { - "variant" => Ok(ExecutablePanelMember::Variant(path)), - "assay" => Ok(ExecutablePanelMember::Assay(path)), - _ => unreachable!("panel_executable_member_path validates member kind"), - } -} - -pub fn panel_executable_member_path<'a>( - kind: &str, - path: Option<&'a str>, -) -> Result<&'a str, String> { - let Some(path) = path else { - return Err("remote panel members are not executable yet".to_owned()); - }; - if !matches!(kind, "variant" | "assay") { - return Err(format!("panel member kind '{kind}' is not executable")); - } - Ok(path) -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ExecutableAssayMember<'a> { - Variant(&'a str), -} -pub fn assay_executable_member<'a>( - kind: &str, - path: Option<&'a str>, -) -> Result, String> { - let path = assay_executable_member_path(kind, path)?; - Ok(ExecutableAssayMember::Variant(path)) -} -pub fn assay_executable_member_path<'a>( - kind: &str, - path: Option<&'a str>, -) -> Result<&'a str, String> { - if kind != "variant" { - return Err(format!("assay member kind '{kind}' is not executable")); - } - let Some(path) = path else { - return Err("remote assay members are not executable yet".to_owned()); - }; - Ok(path) -} - pub(super) fn manifest_supports_findings(schema: &str) -> bool { matches!( schema, @@ -438,34 +419,6 @@ pub(super) fn manifest_supports_findings(schema: &str) -> bool { ) } -fn manifest_has_traversable_members(schema: &str) -> bool { - matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") -} - -pub(super) fn traversable_manifest_member_paths<'a>( - schema: &str, - value: &'a serde_yaml::Value, -) -> Vec<&'a str> { - if !manifest_has_traversable_members(schema) { - return Vec::new(); - } - value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - .into_iter() - .flatten() - .filter_map(traversable_manifest_member_path) - .collect() -} - -fn traversable_manifest_member_path(member: &serde_yaml::Value) -> Option<&str> { - let kind = member.get("kind").and_then(serde_yaml::Value::as_str)?; - if !matches!(kind, "variant" | "assay") { - return None; - } - member.get("path").and_then(serde_yaml::Value::as_str) -} - pub(super) fn yaml_to_json(value: serde_yaml::Value) -> Result { serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) } diff --git a/rust/bioscript-reporting/src/manifest_catalogue.rs b/rust/bioscript-reporting/src/manifest_catalogue.rs new file mode 100644 index 0000000..20666bb --- /dev/null +++ b/rust/bioscript-reporting/src/manifest_catalogue.rs @@ -0,0 +1,302 @@ +use std::{collections::BTreeMap, path::PathBuf}; + +use bioscript_core::{GenomicLocus, VariantKind, VariantSpec}; +use bioscript_schema::VariantManifest; + +use super::{ + ManifestWorkspace, VariantManifestTask, matches_variant_manifest_filters, yaml_string, + yaml_string_sequence, +}; + +pub(super) fn load_variant_catalogue_tasks( + workspace: &impl ManifestWorkspace, + path: &str, + filters: &[String], +) -> Result, String> { + let value = workspace.load_yaml(path)?; + let variants = value + .get("variants") + .and_then(serde_yaml::Value::as_mapping) + .ok_or_else(|| format!("{path} is missing variants table declaration"))?; + let source = variants + .get(serde_yaml::Value::String("source".to_owned())) + .and_then(serde_yaml::Value::as_str) + .ok_or_else(|| format!("{path} variants.source missing"))?; + let source_path = workspace.resolve(path, source)?; + let table_text = workspace.load_text(&source_path)?; + let rows = parse_tsv_rows(&table_text)?; + let columns = CatalogueColumns::from_manifest(variants); + let catalogue_name = + yaml_string(&value, "name").unwrap_or_else(|| "variant-catalogue".to_owned()); + let catalogue_tags = yaml_string_sequence(&value, "tags") + .into_iter() + .filter_map(|value| value.as_str().map(ToOwned::to_owned)) + .collect::>(); + + rows.into_iter() + .enumerate() + .map(|(idx, row)| { + catalogue_row_task(path, &catalogue_name, &catalogue_tags, &columns, idx, &row) + }) + .filter_map(|result| match result { + Ok(task) + if matches_variant_manifest_filters( + &task.manifest, + &task.manifest_path, + filters, + ) => + { + Some(Ok(task)) + } + Ok(_) => None, + Err(err) => Some(Err(err)), + }) + .collect() +} + +fn catalogue_row_task( + catalogue_path: &str, + catalogue_name: &str, + catalogue_tags: &[String], + columns: &CatalogueColumns, + idx: usize, + row: &BTreeMap, +) -> Result { + let variant_id = columns + .value(row, "id") + .or_else(|| columns.value(row, "identifier.rsid")) + .filter(|value| !value.is_empty()) + .ok_or_else(|| { + format!( + "{catalogue_path} variants.tsv row {} missing variant id", + idx + 2 + ) + })?; + let name = columns + .value(row, "name") + .filter(|value| !value.is_empty()) + .map_or_else( + || format!("{catalogue_name}-{variant_id}"), + ToOwned::to_owned, + ); + let mut rsids = split_list( + columns.value(row, "identifier.rsid"), + columns.separator("identifier.rsid"), + ); + rsids.extend(split_list( + columns.value(row, "identifier.aliases"), + columns.separator("identifier.aliases"), + )); + rsids.sort(); + rsids.dedup(); + let spec = VariantSpec { + rsids, + grch37: catalogue_locus(columns, row, "grch37")?, + grch38: catalogue_locus(columns, row, "grch38")?, + reference: columns.value(row, "alleles.ref").map(ToOwned::to_owned), + alternate: split_list( + columns.value(row, "alleles.alts"), + columns.separator("alleles.alts"), + ) + .into_iter() + .next(), + kind: columns + .value(row, "alleles.kind") + .map(catalogue_variant_kind), + ..VariantSpec::default() + }; + if !spec.has_rsids() && !spec.has_coordinates() { + return Err(format!( + "{catalogue_path} variants.tsv row {} has no rsid or coordinates", + idx + 2 + )); + } + let manifest = VariantManifest { + path: PathBuf::from(format!("{catalogue_path}#{variant_id}")), + name, + tags: catalogue_tags.to_vec(), + spec, + }; + Ok(VariantManifestTask { + manifest_path: manifest.path.display().to_string(), + manifest, + }) +} + +fn catalogue_locus( + columns: &CatalogueColumns, + row: &BTreeMap, + assembly: &str, +) -> Result, String> { + let chrom_role = format!("coordinates.{assembly}.chrom"); + let Some(chrom) = columns + .value(row, &chrom_role) + .filter(|value| !value.is_empty()) + else { + return Ok(None); + }; + let pos_role = format!("coordinates.{assembly}.pos"); + if let Some(pos) = columns + .value(row, &pos_role) + .filter(|value| !value.is_empty()) + { + let pos = parse_catalogue_i64(pos, &pos_role)?; + return Ok(Some(GenomicLocus { + chrom: chrom.to_owned(), + start: pos, + end: pos, + })); + } + let start_role = format!("coordinates.{assembly}.start"); + let end_role = format!("coordinates.{assembly}.end"); + match ( + columns + .value(row, &start_role) + .filter(|value| !value.is_empty()), + columns + .value(row, &end_role) + .filter(|value| !value.is_empty()), + ) { + (Some(start), Some(end)) => Ok(Some(GenomicLocus { + chrom: chrom.to_owned(), + start: parse_catalogue_i64(start, &start_role)?, + end: parse_catalogue_i64(end, &end_role)?, + })), + (None, None) => Ok(None), + _ => Err(format!("expected both {start_role} and {end_role}")), + } +} + +fn parse_catalogue_i64(value: &str, role: &str) -> Result { + value + .parse::() + .map_err(|err| format!("invalid integer for {role}: {value}: {err}")) +} + +fn catalogue_variant_kind(value: &str) -> VariantKind { + match value { + "snv" | "snp" => VariantKind::Snp, + "deletion" => VariantKind::Deletion, + "insertion" => VariantKind::Insertion, + "indel" => VariantKind::Indel, + _ => VariantKind::Other, + } +} + +fn split_list(value: Option<&str>, separator: Option<&str>) -> Vec { + let Some(value) = value else { + return Vec::new(); + }; + let separator = separator.unwrap_or("|"); + value + .split(separator) + .map(str::trim) + .filter(|item| !item.is_empty()) + .map(ToOwned::to_owned) + .collect() +} + +fn parse_tsv_rows(text: &str) -> Result>, String> { + let mut lines = text.lines().filter(|line| !line.trim().is_empty()); + let Some(header_line) = lines.next() else { + return Ok(Vec::new()); + }; + let headers = header_line + .split('\t') + .map(ToOwned::to_owned) + .collect::>(); + if headers.is_empty() || headers.iter().any(|header| header.trim().is_empty()) { + return Err("TSV header contains an empty column name".to_owned()); + } + Ok(lines + .map(|line| { + let cells = line.split('\t').collect::>(); + headers + .iter() + .enumerate() + .map(|(idx, header)| { + ( + header.clone(), + cells.get(idx).copied().unwrap_or_default().to_owned(), + ) + }) + .collect() + }) + .collect()) +} + +#[derive(Debug, Default)] +struct CatalogueColumns { + role_to_column: BTreeMap, + role_to_separator: BTreeMap, +} + +impl CatalogueColumns { + fn from_manifest(mapping: &serde_yaml::Mapping) -> Self { + let mut columns = Self::default(); + if let Some(column_mappings) = mapping + .get(serde_yaml::Value::String("columns".to_owned())) + .and_then(serde_yaml::Value::as_mapping) + { + for (column, config) in column_mappings { + let Some(column_name) = column.as_str() else { + continue; + }; + let Some(config) = config.as_mapping() else { + continue; + }; + let Some(role) = config + .get(serde_yaml::Value::String("role".to_owned())) + .and_then(serde_yaml::Value::as_str) + else { + continue; + }; + columns + .role_to_column + .insert(role.to_owned(), column_name.to_owned()); + if let Some(separator) = config + .get(serde_yaml::Value::String("list_separator".to_owned())) + .and_then(serde_yaml::Value::as_str) + { + columns + .role_to_separator + .insert(role.to_owned(), separator.to_owned()); + } + } + } + columns.insert_default("id", "variant_id"); + columns.insert_default("name", "name"); + columns.insert_default("gene", "gene"); + columns.insert_default("identifier.rsid", "rsid"); + columns.insert_default("identifier.aliases", "aliases"); + columns.insert_default("alleles.kind", "kind"); + columns.insert_default("alleles.ref", "ref"); + columns.insert_default("alleles.alts", "alts"); + columns.insert_default("coordinates.grch37.chrom", "grch37_chrom"); + columns.insert_default("coordinates.grch37.pos", "grch37_pos"); + columns.insert_default("coordinates.grch37.start", "grch37_start"); + columns.insert_default("coordinates.grch37.end", "grch37_end"); + columns.insert_default("coordinates.grch38.chrom", "grch38_chrom"); + columns.insert_default("coordinates.grch38.pos", "grch38_pos"); + columns.insert_default("coordinates.grch38.start", "grch38_start"); + columns.insert_default("coordinates.grch38.end", "grch38_end"); + columns + } + + fn insert_default(&mut self, role: &str, column: &str) { + self.role_to_column + .entry(role.to_owned()) + .or_insert_with(|| column.to_owned()); + } + + fn value<'a>(&self, row: &'a BTreeMap, role: &str) -> Option<&'a str> { + self.role_to_column + .get(role) + .and_then(|column| row.get(column)) + .map(String::as_str) + } + + fn separator(&self, role: &str) -> Option<&str> { + self.role_to_separator.get(role).map(String::as_str) + } +} diff --git a/rust/bioscript-reporting/src/manifest_members.rs b/rust/bioscript-reporting/src/manifest_members.rs new file mode 100644 index 0000000..149e711 --- /dev/null +++ b/rust/bioscript-reporting/src/manifest_members.rs @@ -0,0 +1,91 @@ +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExecutablePanelMember<'a> { + Variant(&'a str), + VariantCatalogue(&'a str), + Assay(&'a str), +} + +pub fn panel_executable_member<'a>( + kind: &str, + path: Option<&'a str>, +) -> Result, String> { + let path = panel_executable_member_path(kind, path)?; + match kind { + "variant" => Ok(ExecutablePanelMember::Variant(path)), + "variant-catalogue" => Ok(ExecutablePanelMember::VariantCatalogue(path)), + "assay" => Ok(ExecutablePanelMember::Assay(path)), + _ => unreachable!("panel_executable_member_path validates member kind"), + } +} + +pub fn panel_executable_member_path<'a>( + kind: &str, + path: Option<&'a str>, +) -> Result<&'a str, String> { + let Some(path) = path else { + return Err("remote panel members are not executable yet".to_owned()); + }; + if !matches!(kind, "variant" | "variant-catalogue" | "assay") { + return Err(format!("panel member kind '{kind}' is not executable")); + } + Ok(path) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExecutableAssayMember<'a> { + Variant(&'a str), + VariantCatalogue(&'a str), +} + +pub fn assay_executable_member<'a>( + kind: &str, + path: Option<&'a str>, +) -> Result, String> { + let path = assay_executable_member_path(kind, path)?; + match kind { + "variant" => Ok(ExecutableAssayMember::Variant(path)), + "variant-catalogue" => Ok(ExecutableAssayMember::VariantCatalogue(path)), + _ => unreachable!("assay_executable_member_path validates member kind"), + } +} + +pub fn assay_executable_member_path<'a>( + kind: &str, + path: Option<&'a str>, +) -> Result<&'a str, String> { + if !matches!(kind, "variant" | "variant-catalogue") { + return Err(format!("assay member kind '{kind}' is not executable")); + } + let Some(path) = path else { + return Err("remote assay members are not executable yet".to_owned()); + }; + Ok(path) +} + +pub(crate) fn traversable_manifest_member_paths<'a>( + schema: &str, + value: &'a serde_yaml::Value, +) -> Vec<&'a str> { + if !manifest_has_traversable_members(schema) { + return Vec::new(); + } + value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + .into_iter() + .flatten() + .filter_map(traversable_manifest_member_path) + .collect() +} + +fn manifest_has_traversable_members(schema: &str) -> bool { + matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") +} + +fn traversable_manifest_member_path(member: &serde_yaml::Value) -> Option<&str> { + let kind = member.get("kind").and_then(serde_yaml::Value::as_str)?; + if !matches!(kind, "variant" | "variant-catalogue" | "assay") { + return None; + } + member.get("path").and_then(serde_yaml::Value::as_str) +} diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index 3733cda..65030c2 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -321,6 +321,9 @@ members: - kind: "variant" path: "g1-site-1.yaml" version: "1.0" + - kind: "variant-catalogue" + path: "catalogue/variants.yaml" + version: "1.0" interpretations: - id: "apol1_status" kind: "bioscript" diff --git a/rust/bioscript-wasm/src/report_api.rs b/rust/bioscript-wasm/src/report_api.rs index 459c621..b797891 100644 --- a/rust/bioscript-wasm/src/report_api.rs +++ b/rust/bioscript-wasm/src/report_api.rs @@ -12,7 +12,7 @@ use bioscript_formats::{ inspect_bytes as inspect_bytes_rs, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; -use bioscript_schema::{PanelInterpretation, VariantManifest, load_variant_manifest_text}; +use bioscript_schema::PanelInterpretation; use monty::{MontyObject, ResourceLimits}; use serde::{Deserialize, Serialize}; use wasm_bindgen::prelude::*; diff --git a/rust/bioscript-wasm/src/report_workspace.rs b/rust/bioscript-wasm/src/report_workspace.rs index 85451a5..a42141c 100644 --- a/rust/bioscript-wasm/src/report_workspace.rs +++ b/rust/bioscript-wasm/src/report_workspace.rs @@ -65,11 +65,6 @@ impl PackageWorkspace { normalize_package_path(&base.join(relative).display().to_string()) } - pub(super) fn load_variant(&self, path: &str) -> Result { - load_variant_manifest_text(path, self.text(path)?) - .map_err(|err| JsError::new(&format!("load variant {path}: {err}"))) - } - pub(super) fn run_manifest_rows( &self, manifest_path: &str, @@ -114,9 +109,19 @@ impl PackageWorkspace { fallback_assembly: Option, ) -> Result { let row_path = row.get("path").cloned().unwrap_or_default(); - let manifest = self.load_variant(&row_path)?; - let value = self.yaml(&row_path)?; - let gene = yaml_string(&value, "gene").unwrap_or_default(); + let task = bioscript_reporting::load_variant_manifest_task_by_path(self, &row_path) + .map_err(|err| JsError::new(&err))?; + let manifest = task.manifest; + let (gene, source, observed_alt_alleles) = if row_path.contains('#') { + (String::new(), serde_json::Value::Null, Vec::new()) + } else { + let value = self.yaml(&row_path)?; + ( + yaml_string(&value, "gene").unwrap_or_default(), + variant_primary_source_from_yaml(&value), + variant_observed_alt_alleles_from_yaml(&value), + ) + }; Ok(bioscript_reporting::app_observation_from_manifest_row( bioscript_reporting::AppObservationInput { row, @@ -124,8 +129,8 @@ impl PackageWorkspace { assay_id, manifest, gene, - source: variant_primary_source_from_yaml(&value), - observed_alt_alleles: variant_observed_alt_alleles_from_yaml(&value), + source, + observed_alt_alleles, inferred_sex, fallback_assembly, }, diff --git a/rust/bioscript-wasm/src/report_workspace/analysis.rs b/rust/bioscript-wasm/src/report_workspace/analysis.rs index e63f248..bedca4b 100644 --- a/rust/bioscript-wasm/src/report_workspace/analysis.rs +++ b/rust/bioscript-wasm/src/report_workspace/analysis.rs @@ -41,7 +41,7 @@ impl PackageWorkspace { manifest_path: &str, manifest_name: &str, interpretations: &[PanelInterpretation], - input_name: &str, + _input_name: &str, input_bytes: &[u8], preloaded_observations: &[VariantObservation], participant_id: &str, @@ -72,19 +72,46 @@ impl PackageWorkspace { participant_id, &interpretation.id, ); - let observations_file = options + let _observations_file = options .output_dir .as_deref() .filter(|dir| !dir.is_empty()) .map(|dir| format!("{}/{}", dir.trim_end_matches('/'), observations_output_file)) .unwrap_or(observations_output_file); - let mut virtual_text_files = self.files.clone(); + let output_extension = Path::new(&output_file) + .extension() + .and_then(|value| value.to_str()) + .unwrap_or(analysis_format.extension); + let virtual_input_file = "/input/genotypes".to_owned(); + let virtual_output_file = format!("/output/results.{output_extension}"); + let virtual_observations_file = "/work/observations.tsv".to_owned(); + let script_virtual_path = virtual_pipeline_path(&script_path, "analysis.py"); + let manifest_virtual_path = virtual_pipeline_path(manifest_path, "manifest.yaml"); + let mut virtual_text_files = BTreeMap::new(); + virtual_text_files.insert( + script_virtual_path.clone(), + self.text(&script_path)?.to_owned(), + ); + virtual_text_files.insert( + manifest_virtual_path.clone(), + self.text(manifest_path)?.to_owned(), + ); virtual_text_files.insert( - observations_file.clone(), + virtual_observations_file.clone(), bioscript_reporting::render_analysis_observations_tsv(preloaded_observations), ); + let mut asset_paths = BTreeMap::new(); + for asset in &interpretation.assets { + let asset_path = self.resolve(manifest_path, &asset.path)?; + let virtual_asset_path = virtual_pipeline_path(&asset_path, &asset.path); + virtual_text_files.insert( + virtual_asset_path.clone(), + self.text(&asset_path)?.to_owned(), + ); + asset_paths.insert(asset.id.clone(), virtual_asset_path); + } let mut virtual_binary_files = BTreeMap::new(); - virtual_binary_files.insert(input_name.to_owned(), input_bytes.to_vec()); + virtual_binary_files.insert(virtual_input_file.clone(), input_bytes.to_vec()); let limits = ResourceLimits::new() .max_duration(Duration::from_millis(options.analysis_max_duration_ms)) .max_memory(16 * 1024 * 1024) @@ -96,24 +123,39 @@ impl PackageWorkspace { RuntimeConfig { limits, loader: loader.clone(), - context: BTreeMap::new(), + context: analysis_context( + participant_id, + &virtual_input_file, + &script_virtual_path, + &manifest_virtual_path, + &asset_paths, + &virtual_observations_file, + &virtual_output_file, + ), virtual_binary_files, - virtual_text_files: std::mem::take(&mut virtual_text_files), + virtual_text_files, preloaded_observations: preloaded_observations.to_vec(), }, ) .map_err(|err| JsError::new(&format!("create analysis runtime failed: {err:?}")))?; runtime .run_file( - &script_path, + &script_virtual_path, None, vec![ - ("input_file", MontyObject::String(input_name.to_owned())), - ("output_file", MontyObject::String(output_file.clone())), + ( + "input_file", + MontyObject::String(virtual_input_file.clone()), + ), + ( + "output_file", + MontyObject::String(virtual_output_file.clone()), + ), ( "observations_file", - MontyObject::String(observations_file.clone()), + MontyObject::String(virtual_observations_file.clone()), ), + ("asset_paths", monty_string_dict(&asset_paths)), ( "participant_id", MontyObject::String(participant_id.to_owned()), @@ -124,9 +166,9 @@ impl PackageWorkspace { JsError::new(&format!("analysis {} failed: {err:?}", interpretation.id)) })?; let written = runtime.virtual_written_text_files(); - let text = written.get(&output_file).ok_or_else(|| { + let text = written.get(&virtual_output_file).ok_or_else(|| { JsError::new(&format!( - "analysis {} did not write {output_file}", + "analysis {} did not write {virtual_output_file}", interpretation.id )) })?; @@ -140,7 +182,7 @@ impl PackageWorkspace { manifest_path, script_path: &script_path, output_file: &output_file, - observations_file: Some(&observations_file), + observations_file: Some(&virtual_observations_file), row_headers, rows, }, @@ -149,3 +191,65 @@ impl PackageWorkspace { Ok(outputs) } } + +fn virtual_pipeline_path(path: &str, fallback: &str) -> String { + let name = Path::new(path) + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or(fallback); + format!("/input/pipeline/{name}") +} + +fn analysis_context( + participant_id: &str, + input_file: &str, + script_path: &str, + manifest_path: &str, + asset_paths: &BTreeMap, + observations_file: &str, + output_file: &str, +) -> BTreeMap { + BTreeMap::from([ + ( + "participant_id".to_owned(), + MontyObject::String(participant_id.to_owned()), + ), + ( + "input_files".to_owned(), + monty_string_dict(&BTreeMap::from([( + "genotypes".to_owned(), + input_file.to_owned(), + )])), + ), + ( + "pipeline_files".to_owned(), + monty_string_dict(&BTreeMap::from([ + ("manifest".to_owned(), manifest_path.to_owned()), + ("analysis".to_owned(), script_path.to_owned()), + ])), + ), + ("assets".to_owned(), monty_string_dict(asset_paths)), + ( + "observations_file".to_owned(), + MontyObject::String(observations_file.to_owned()), + ), + ( + "output_file".to_owned(), + MontyObject::String(output_file.to_owned()), + ), + ]) +} + +fn monty_string_dict(values: &BTreeMap) -> MontyObject { + MontyObject::Dict( + values + .iter() + .map(|(key, value)| { + ( + MontyObject::String(key.clone()), + MontyObject::String(value.clone()), + ) + }) + .collect(), + ) +} From bcb2dbab0124e086f087ff6664f7489608bf2c14 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Fri, 15 May 2026 13:16:16 +1000 Subject: [PATCH 3/5] adding variant catalogue --- rust/bioscript-core/src/observation.rs | 3 +- rust/bioscript-formats/src/genotype.rs | 34 +++++++++++++-- .../src/genotype/backends.rs | 28 ++++++++++++- .../src/genotype/delimited/scan.rs | 41 ++++++++++++++++++- .../bioscript-formats/src/genotype/loaders.rs | 17 +++++++- rust/bioscript-formats/src/genotype/types.rs | 2 + rust/bioscript-formats/tests/file_formats.rs | 2 +- .../tests/file_formats/delimited.rs | 27 ++++++++++-- .../tests/file_formats/zip_and_fixtures.rs | 9 +++- rust/bioscript-reporting/src/html.rs | 3 ++ .../src/html/observations.rs | 11 ++++- .../src/manifest_catalogue.rs | 9 +++- rust/bioscript-reporting/src/observation.rs | 38 +++++++++++++++++ 13 files changed, 207 insertions(+), 17 deletions(-) diff --git a/rust/bioscript-core/src/observation.rs b/rust/bioscript-core/src/observation.rs index 40e7da9..79e8f26 100644 --- a/rust/bioscript-core/src/observation.rs +++ b/rust/bioscript-core/src/observation.rs @@ -94,12 +94,13 @@ pub struct ObservationRecord { pub facets: Option, } -pub const OBSERVATION_TSV_HEADERS: [&str; 29] = [ +pub const OBSERVATION_TSV_HEADERS: [&str; 30] = [ "participant_id", "assay_id", "assay_version", "variant_key", "rsid", + "gene", "assembly", "chrom", "pos_start", diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index ff58d03..6671709 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -91,6 +91,7 @@ impl GenotypeStore { format: GenotypeSourceFormat::Text, values: HashMap::new(), locus_values: HashMap::new(), + assembly: None, source_lines: HashMap::new(), }), } @@ -105,8 +106,9 @@ impl GenotypeStore { path, GenotypeSourceFormat::Text, None, + options, )), - GenotypeSourceFormat::Zip => Self::from_zip_file(path), + GenotypeSourceFormat::Zip => Self::from_zip_file(path, options), GenotypeSourceFormat::Vcf => Ok(Self::from_vcf_file(path, options)), GenotypeSourceFormat::Cram => Self::from_cram_file(path, options), GenotypeSourceFormat::Bam => Err(RuntimeError::Unsupported(format!( @@ -183,7 +185,7 @@ impl GenotypeStore { } } - fn from_zip_file(path: &Path) -> Result { + fn from_zip_file(path: &Path, options: &GenotypeLoadOptions) -> Result { let selected = select_zip_entry(path)?; let lower = selected.to_ascii_lowercase(); if lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") { @@ -212,6 +214,7 @@ impl GenotypeStore { path, GenotypeSourceFormat::Zip, Some(selected), + options, )) } @@ -244,12 +247,14 @@ impl GenotypeStore { path: &Path, format: GenotypeSourceFormat, zip_entry_name: Option, + options: &GenotypeLoadOptions, ) -> Self { Self { backend: QueryBackend::Delimited(DelimitedBackend { format, path: path.to_path_buf(), zip_entry_name, + options: options.clone(), }), } } @@ -1115,6 +1120,10 @@ mod tests { format: GenotypeSourceFormat::Text, path: text.clone(), zip_entry_name: None, + options: GenotypeLoadOptions { + assembly: Some(Assembly::Grch38), + ..GenotypeLoadOptions::default() + }, }; let variants = vec![ VariantSpec { @@ -1147,6 +1156,21 @@ mod tests { Some("CT") ); + let unknown_assembly_backend = DelimitedBackend { + format: GenotypeSourceFormat::Text, + path: text.clone(), + zip_entry_name: None, + options: GenotypeLoadOptions::default(), + }; + let unknown_assembly_err = + scan_delimited_variants(&unknown_assembly_backend, &variants[1..2]).unwrap_err(); + assert!( + unknown_assembly_err + .to_string() + .contains("delimited genotype input assembly is unknown"), + "{unknown_assembly_err}" + ); + let zip_path = dir.join("sample.zip"); let cursor = std::io::Cursor::new(Vec::new()); let mut writer = zip::ZipWriter::new(cursor); @@ -1191,8 +1215,9 @@ mod tests { let unsupported_backend = DelimitedBackend { format: GenotypeSourceFormat::Vcf, - path: text, + path: text.clone(), zip_entry_name: None, + options: GenotypeLoadOptions::default(), }; let err = scan_delimited_variants(&unsupported_backend, &variants).unwrap_err(); assert!( @@ -1393,6 +1418,7 @@ mod tests { format: GenotypeSourceFormat::Zip, path: zip_path.clone(), zip_entry_name: None, + options: GenotypeLoadOptions::default(), }; let err = scan_delimited_variants(&bad_zip_backend, &[VariantSpec::default()]).unwrap_err(); assert!( @@ -1405,6 +1431,7 @@ mod tests { format: GenotypeSourceFormat::Zip, path: zip_path, zip_entry_name: Some("missing.csv".to_owned()), + options: GenotypeLoadOptions::default(), }; let err = scan_delimited_variants(&bad_entry_backend, &[VariantSpec::default()]).unwrap_err(); @@ -1417,6 +1444,7 @@ mod tests { format: GenotypeSourceFormat::Text, path: dir.join("missing.txt"), zip_entry_name: None, + options: GenotypeLoadOptions::default(), }; let err = scan_delimited_variants(&missing_text_backend, &[VariantSpec::default()]).unwrap_err(); diff --git a/rust/bioscript-formats/src/genotype/backends.rs b/rust/bioscript-formats/src/genotype/backends.rs index 659328d..fa99bd5 100644 --- a/rust/bioscript-formats/src/genotype/backends.rs +++ b/rust/bioscript-formats/src/genotype/backends.rs @@ -1,4 +1,4 @@ -use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; +use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantObservation, VariantSpec}; use super::{ describe_query, lookup_indexed_vcf_variants, scan_delimited_variants, scan_vcf_variants, @@ -39,7 +39,7 @@ impl RsidMapBackend { } } - if let Some(locus) = variant.grch38.as_ref().or(variant.grch37.as_ref()) + if let Some(locus) = delimited_locus_for_assembly(variant, self.assembly) && let Some((value, matched_rsid, source)) = self.locus_values.get(&( locus.chrom.trim_start_matches("chr").to_ascii_lowercase(), locus.start, @@ -57,6 +57,19 @@ impl RsidMapBackend { }); } + if variant.has_coordinates() + && self.assembly.is_none() + && matches!( + self.format, + GenotypeSourceFormat::Text | GenotypeSourceFormat::Zip + ) + { + return Err(RuntimeError::Unsupported(format!( + "delimited genotype input assembly is unknown; refusing coordinate lookup for {}", + describe_query(variant) + ))); + } + let evidence = if variant.has_coordinates() { format!( "no matching rsid or locus found for {}", @@ -73,6 +86,17 @@ impl RsidMapBackend { } } +pub(crate) fn delimited_locus_for_assembly( + variant: &VariantSpec, + assembly: Option, +) -> Option<&GenomicLocus> { + match assembly { + Some(Assembly::Grch37) => variant.grch37.as_ref(), + Some(Assembly::Grch38) => variant.grch38.as_ref(), + None => None, + } +} + impl DelimitedBackend { pub(super) fn backend_name(&self) -> &'static str { match self.format { diff --git a/rust/bioscript-formats/src/genotype/delimited/scan.rs b/rust/bioscript-formats/src/genotype/delimited/scan.rs index 1b41b82..1a440da 100644 --- a/rust/bioscript-formats/src/genotype/delimited/scan.rs +++ b/rust/bioscript-formats/src/genotype/delimited/scan.rs @@ -6,10 +6,14 @@ use std::{ use zip::ZipArchive; +use crate::inspect::detect_assembly; use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; use super::{ - super::{GenotypeSourceFormat, describe_query, types::DelimitedBackend, variant_sort_key}, + super::{ + GenotypeSourceFormat, backends::delimited_locus_for_assembly, describe_query, + types::DelimitedBackend, variant_sort_key, + }, DelimitedColumnIndexes, detect_delimiter, parse_streaming_row, }; @@ -25,11 +29,14 @@ pub(crate) fn scan_delimited_variants( let mut results = vec![VariantObservation::default(); variants.len()]; let mut unresolved = variants.len(); + let has_coordinate_queries = variants.iter().any(VariantSpec::has_coordinates); + let mut detected_assembly = backend.options.assembly; + for (idx, variant) in &indexed { for rsid in &variant.rsids { rsid_targets.entry(rsid.clone()).or_default().push(*idx); } - if let Some(locus) = variant.grch38.as_ref().or(variant.grch37.as_ref()) { + if let Some(locus) = delimited_locus_for_assembly(variant, detected_assembly) { coord_targets .entry(( locus.chrom.trim_start_matches("chr").to_ascii_lowercase(), @@ -58,6 +65,36 @@ pub(crate) fn scan_delimited_variants( } let delimiter = detect_delimiter(&probe_lines); + if detected_assembly.is_none() { + let mut label = backend + .path + .to_string_lossy() + .to_ascii_lowercase() + .to_string(); + if let Some(entry_name) = backend.zip_entry_name.as_ref() { + label.push('\n'); + label.push_str(&entry_name.to_ascii_lowercase()); + } + detected_assembly = detect_assembly(&label, &probe_lines); + if let Some(assembly) = detected_assembly { + for (idx, variant) in &indexed { + if let Some(locus) = delimited_locus_for_assembly(variant, Some(assembly)) { + coord_targets + .entry(( + locus.chrom.trim_start_matches("chr").to_ascii_lowercase(), + locus.start, + )) + .or_default() + .push(*idx); + } + } + } else if has_coordinate_queries { + return Err(RuntimeError::Unsupported(format!( + "delimited genotype input assembly is unknown for {}; refusing coordinate lookup", + backend.path.display() + ))); + } + } let mut column_indexes: Option = None; let mut comment_header: Option> = None; diff --git a/rust/bioscript-formats/src/genotype/loaders.rs b/rust/bioscript-formats/src/genotype/loaders.rs index 079a069..feed1db 100644 --- a/rust/bioscript-formats/src/genotype/loaders.rs +++ b/rust/bioscript-formats/src/genotype/loaders.rs @@ -1,6 +1,8 @@ use std::{collections::HashMap, io::BufRead}; -use bioscript_core::RuntimeError; +use bioscript_core::{Assembly, RuntimeError}; + +use crate::inspect::detect_assembly; use super::{ COMMENT_PREFIXES, GenotypeSourceFormat, GenotypeStore, QueryBackend, RowParser, RsidMapBackend, @@ -28,6 +30,7 @@ pub(crate) fn from_vcf_reader( GenotypeSourceFormat::Vcf, values, HashMap::new(), + None, HashMap::new(), )) } @@ -64,6 +67,7 @@ pub(crate) fn from_delimited_reader( } let mut parser = RowParser::new(delimiter.unwrap_or(super::Delimiter::Tab)); + let assembly = detect_assembly(&label.to_ascii_lowercase(), &prelude); let mut values = HashMap::new(); let mut locus_values = HashMap::new(); let mut source_lines = HashMap::new(); @@ -93,7 +97,13 @@ pub(crate) fn from_delimited_reader( )?; } - Ok(from_rsid_map(format, values, locus_values, source_lines)) + Ok(from_rsid_map( + format, + values, + locus_values, + assembly, + source_lines, + )) } pub(crate) fn from_vcf_lines(lines: Vec) -> Result { @@ -105,6 +115,7 @@ pub(crate) fn from_vcf_lines(lines: Vec) -> Result, locus_values: HashMap<(String, i64), (String, Option, String)>, + assembly: Option, source_lines: HashMap, ) -> GenotypeStore { GenotypeStore { @@ -178,6 +190,7 @@ fn from_rsid_map( format, values, locus_values, + assembly, source_lines, }), } diff --git a/rust/bioscript-formats/src/genotype/types.rs b/rust/bioscript-formats/src/genotype/types.rs index 60b0cd7..67d5473 100644 --- a/rust/bioscript-formats/src/genotype/types.rs +++ b/rust/bioscript-formats/src/genotype/types.rs @@ -35,6 +35,7 @@ pub(crate) struct RsidMapBackend { pub(crate) format: GenotypeSourceFormat, pub(crate) values: HashMap, pub(crate) locus_values: HashMap<(String, i64), (String, Option, String)>, + pub(crate) assembly: Option, /// Original input line per rsid, retained so wasm-side `from_bytes` loads /// can emit the same `| source line: …` evidence that the CLI's /// path-backed `DelimitedBackend` does on every lookup. Empty for @@ -47,6 +48,7 @@ pub(crate) struct DelimitedBackend { pub(crate) format: GenotypeSourceFormat, pub(crate) path: PathBuf, pub(crate) zip_entry_name: Option, + pub(crate) options: GenotypeLoadOptions, } #[derive(Debug, Clone)] diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index 104ea84..a561b34 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -5,7 +5,7 @@ use std::{ time::{SystemTime, UNIX_EPOCH}, }; -use bioscript_core::{VariantKind, VariantSpec}; +use bioscript_core::{Assembly, VariantKind, VariantSpec}; use bioscript_formats::{ GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, alignment, }; diff --git a/rust/bioscript-formats/tests/file_formats/delimited.rs b/rust/bioscript-formats/tests/file_formats/delimited.rs index fd9a733..15b450b 100644 --- a/rust/bioscript-formats/tests/file_formats/delimited.rs +++ b/rust/bioscript-formats/tests/file_formats/delimited.rs @@ -14,7 +14,14 @@ fn delimited_parser_handles_comments_blank_lines_csv_and_split_alleles() { ) .unwrap(); - let store = GenotypeStore::from_file(&path).unwrap(); + let store = GenotypeStore::from_file_with_options( + &path, + &GenotypeLoadOptions { + assembly: Some(Assembly::Grch38), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("--")); @@ -55,7 +62,14 @@ fn delimited_parser_uses_comment_headers_aliases_quotes_and_extra_columns() { ) .unwrap(); - let store = GenotypeStore::from_file(&path).unwrap(); + let store = GenotypeStore::from_file_with_options( + &path, + &GenotypeLoadOptions { + assembly: Some(Assembly::Grch38), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); assert_eq!(store.get("rsQuoted").unwrap().as_deref(), Some("AT")); assert_eq!(store.get("rsSlash").unwrap().as_deref(), Some("ID")); @@ -87,7 +101,14 @@ fn delimited_parser_handles_space_delimited_rows_without_headers_and_inline_comm ) .unwrap(); - let store = GenotypeStore::from_file(&path).unwrap(); + let store = GenotypeStore::from_file_with_options( + &path, + &GenotypeLoadOptions { + assembly: Some(Assembly::Grch38), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); assert_eq!(store.get("rsSpace").unwrap().as_deref(), Some("CT")); let observation = store diff --git a/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs b/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs index d814223..68b8af9 100644 --- a/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs +++ b/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs @@ -11,7 +11,14 @@ fn batch_lookup_preserves_input_order_after_coordinate_sorting() { rs1\t1\t10\tAG\n", ) .unwrap(); - let store = GenotypeStore::from_file(&path).unwrap(); + let store = GenotypeStore::from_file_with_options( + &path, + &GenotypeLoadOptions { + assembly: Some(Assembly::Grch38), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); let results = store .lookup_variants(&[ diff --git a/rust/bioscript-reporting/src/html.rs b/rust/bioscript-reporting/src/html.rs index 608f1b3..d536f26 100644 --- a/rust/bioscript-reporting/src/html.rs +++ b/rust/bioscript-reporting/src/html.rs @@ -25,6 +25,9 @@ pub fn render_app_html_document( let mut out = String::from( r##"BioScript report
"##, ); + out.push_str( + r#""#, + ); let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); let has_pgx_findings = !label_findings.is_empty() || !summary_findings.is_empty(); diff --git a/rust/bioscript-reporting/src/html/observations.rs b/rust/bioscript-reporting/src/html/observations.rs index a5cab20..1c9ef86 100644 --- a/rust/bioscript-reporting/src/html/observations.rs +++ b/rust/bioscript-reporting/src/html/observations.rs @@ -170,7 +170,7 @@ pub(super) fn render_observation_cell( return; } if header == "ref_alt" { - class_cell(out, &observation_ref_alt(observation), "mono"); + ref_alt_cell(out, observation); return; } if header == "allele_balance" { @@ -230,6 +230,15 @@ pub(super) fn render_observation_cell( ); } +fn ref_alt_cell(out: &mut String, observation: &serde_json::Value) { + let value = observation_ref_alt(observation); + let escaped_value = html_escape(&value); + let _ = write!( + out, + "{escaped_value}" + ); +} + pub(super) fn observation_is_imputed_vcf_reference(observation: &serde_json::Value) -> bool { observation .get("evidence_raw") diff --git a/rust/bioscript-reporting/src/manifest_catalogue.rs b/rust/bioscript-reporting/src/manifest_catalogue.rs index 20666bb..e7db82f 100644 --- a/rust/bioscript-reporting/src/manifest_catalogue.rs +++ b/rust/bioscript-reporting/src/manifest_catalogue.rs @@ -111,10 +111,17 @@ fn catalogue_row_task( idx + 2 )); } + let mut tags = catalogue_tags.to_vec(); + if let Some(gene) = columns.value(row, "gene").filter(|value| !value.is_empty()) { + let gene_tag = format!("gene:{gene}"); + if !tags.iter().any(|tag| tag == &gene_tag) { + tags.push(gene_tag); + } + } let manifest = VariantManifest { path: PathBuf::from(format!("{catalogue_path}#{variant_id}")), name, - tags: catalogue_tags.to_vec(), + tags, spec, }; Ok(VariantManifestTask { diff --git a/rust/bioscript-reporting/src/observation.rs b/rust/bioscript-reporting/src/observation.rs index 38e4ced..8fd79ba 100644 --- a/rust/bioscript-reporting/src/observation.rs +++ b/rust/bioscript-reporting/src/observation.rs @@ -229,6 +229,16 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { weak_indel_match, zygosity, } = input; + let gene = if gene.is_empty() { + manifest_gene_from_tags(&manifest).unwrap_or_default() + } else { + gene + }; + let source = if source.is_null() { + manifest_default_source(&row, &manifest) + } else { + source + }; serde_json::json!({ "participant_id": row.get("participant_id").cloned().unwrap_or_default(), "assay_id": assay_id, @@ -269,6 +279,34 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { }) } +fn manifest_gene_from_tags(manifest: &VariantManifest) -> Option { + manifest.tags.iter().find_map(|tag| { + tag.strip_prefix("gene:") + .filter(|gene| !gene.is_empty()) + .map(ToOwned::to_owned) + }) +} + +fn manifest_default_source( + row: &BTreeMap, + manifest: &VariantManifest, +) -> serde_json::Value { + let rsid = row + .get("matched_rsid") + .filter(|rsid| !rsid.is_empty()) + .or_else(|| row.get("rsid").filter(|rsid| !rsid.is_empty())) + .or_else(|| manifest.spec.rsids.first().filter(|rsid| !rsid.is_empty())); + let Some(rsid) = rsid else { + return serde_json::Value::Null; + }; + serde_json::json!({ + "kind": "database", + "label": "dbSNP / NCBI SNP", + "url": format!("https://www.ncbi.nlm.nih.gov/snp/{rsid}"), + "fields": ["identifiers.rsids"], + }) +} + fn assembly_row_value(assembly: Assembly) -> String { match assembly { Assembly::Grch37 => "grch37".to_owned(), From d79a3f493d5b8a940ede975b667ea0b60760dda1 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Fri, 15 May 2026 14:06:24 +1000 Subject: [PATCH 4/5] fixing linting --- .../src/genotype/delimited/scan.rs | 6 +- rust/bioscript-reporting/src/html.rs | 2 +- rust/bioscript-reporting/src/observation.rs | 214 +----------------- .../src/observation/genotype_display.rs | 207 +++++++++++++++++ 4 files changed, 218 insertions(+), 211 deletions(-) create mode 100644 rust/bioscript-reporting/src/observation/genotype_display.rs diff --git a/rust/bioscript-formats/src/genotype/delimited/scan.rs b/rust/bioscript-formats/src/genotype/delimited/scan.rs index 87d9a87..a9248dd 100644 --- a/rust/bioscript-formats/src/genotype/delimited/scan.rs +++ b/rust/bioscript-formats/src/genotype/delimited/scan.rs @@ -68,11 +68,7 @@ pub(crate) fn scan_delimited_variants( let delimiter = detect_delimiter(&probe_lines); if detected_assembly.is_none() { - let mut label = backend - .path - .to_string_lossy() - .to_ascii_lowercase() - .to_string(); + let mut label = backend.path.to_string_lossy().to_ascii_lowercase(); if let Some(entry_name) = backend.zip_entry_name.as_ref() { label.push('\n'); label.push_str(&entry_name.to_ascii_lowercase()); diff --git a/rust/bioscript-reporting/src/html.rs b/rust/bioscript-reporting/src/html.rs index 7bd0293..29f978c 100644 --- a/rust/bioscript-reporting/src/html.rs +++ b/rust/bioscript-reporting/src/html.rs @@ -26,7 +26,7 @@ pub fn render_app_html_document( r##"BioScript report
"##, ); out.push_str( - r#""#, + r"", ); let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); diff --git a/rust/bioscript-reporting/src/observation.rs b/rust/bioscript-reporting/src/observation.rs index 337602c..1341143 100644 --- a/rust/bioscript-reporting/src/observation.rs +++ b/rust/bioscript-reporting/src/observation.rs @@ -1,15 +1,20 @@ use std::collections::BTreeMap; -use bioscript_core::{Assembly, GenomicLocus, VariantKind}; -use bioscript_formats::{InferredSex, SexDetectionConfidence, SexInference}; +use bioscript_core::{Assembly, GenomicLocus}; +use bioscript_formats::SexInference; use bioscript_schema::VariantManifest; mod facets; +mod genotype_display; use facets::{ classify_non_reportable_alleles, is_weak_delimited_indel_match, observation_facets, parse_optional_u32, }; +use genotype_display::{ + assembly_row_value, deletion_copy_number_display, genotype_display_from_raw_counts, + hemizygous_display_genotype, normalize_app_genotype, observation_evidence_raw, +}; pub struct AppObservationInput<'a> { pub row: &'a BTreeMap, @@ -307,212 +312,11 @@ fn manifest_default_source( }) } -fn assembly_row_value(assembly: Assembly) -> String { - match assembly { - Assembly::Grch37 => "grch37".to_owned(), - Assembly::Grch38 => "grch38".to_owned(), - } -} - -fn hemizygous_display_genotype(display: &str) -> String { - display - .chars() - .find(char::is_ascii_alphabetic) - .map_or_else(|| display.to_owned(), |allele| allele.to_string()) -} - -fn deletion_copy_number_display( - row: &BTreeMap, - manifest: &VariantManifest, - depth: Option, - alt_count: Option, -) -> Option { - if !matches!(manifest.spec.kind, Some(VariantKind::Deletion)) { - return None; - } - if !matches!(row.get("backend").map(String::as_str), Some("cram" | "bam")) { - return None; - } - if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { - return None; - } - let depth = depth?; - if depth == 0 { - return None; - } - let alt_fraction = f64::from(alt_count.unwrap_or(0)) / f64::from(depth); - if alt_fraction >= 0.8 { - Some("DD".to_owned()) - } else if alt_fraction <= 0.2 { - Some("II".to_owned()) - } else { - Some("DI".to_owned()) - } -} - -fn normalize_app_genotype( - display: &str, - ref_allele: &str, - alt_allele: &str, - kind: Option, - chrom: &str, - inferred_sex: Option<&SexInference>, -) -> (String, String) { - if display.is_empty() { - return ("./.".to_owned(), "unknown".to_owned()); - } - if matches!(kind, Some(VariantKind::Deletion)) - && ref_allele.len() != 1 - && display - .chars() - .filter(char::is_ascii_alphabetic) - .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) - { - return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex); - } - let alleles: Vec = display.chars().filter(char::is_ascii_alphabetic).collect(); - if ref_allele.len() != 1 || alt_allele.len() != 1 { - return (display.to_owned(), "unknown".to_owned()); - } - let ref_ch = ref_allele.chars().next().unwrap_or_default(); - let alt_ch = alt_allele.chars().next().unwrap_or_default(); - if alleles.len() == 1 && is_haploid_sex_chromosome(chrom) { - let allele = alleles[0]; - if allele == ref_ch { - return ("0".to_owned(), "hem_ref".to_owned()); - } - if allele == alt_ch { - return ("1".to_owned(), "hem_alt".to_owned()); - } - return (display.to_owned(), "unknown".to_owned()); - } - if alleles.len() != 2 { - return (display.to_owned(), "unknown".to_owned()); - } - if is_confident_male_sex_chromosome(chrom, inferred_sex) && alleles[0] == alleles[1] { - let allele = alleles[0]; - if allele == ref_ch { - return ("0".to_owned(), "hem_ref".to_owned()); - } - if allele == alt_ch { - return ("1".to_owned(), "hem_alt".to_owned()); - } - return (display.to_owned(), "unknown".to_owned()); - } - let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); - let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); - match (ref_count, alt_count) { - (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), - (1, 1) => ("0/1".to_owned(), "het".to_owned()), - (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), - _ => (display.to_owned(), "unknown".to_owned()), - } -} - -fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool { - is_haploid_sex_chromosome(chrom) - && inferred_sex.is_some_and(|sex| { - sex.sex == InferredSex::Male - && matches!( - sex.confidence, - SexDetectionConfidence::High | SexDetectionConfidence::Medium - ) - }) -} - -fn is_haploid_sex_chromosome(chrom: &str) -> bool { - matches!( - chrom - .trim() - .trim_start_matches("chr") - .trim_start_matches("CHR") - .to_ascii_uppercase() - .as_str(), - "X" | "Y" | "23" | "24" - ) -} - -fn observation_evidence_raw( - row: &BTreeMap, - chrom: &str, - inferred_sex: Option<&SexInference>, -) -> String { - let mut evidence_raw = row.get("evidence").cloned().unwrap_or_default(); - if !is_haploid_sex_chromosome(chrom) { - return evidence_raw; - } - let Some(inferred_sex) = inferred_sex else { - return evidence_raw; - }; - let sex_evidence = sex_inference_evidence_raw(inferred_sex); - if sex_evidence.is_empty() { - return evidence_raw; - } - if evidence_raw.is_empty() { - evidence_raw = sex_evidence; - } else { - evidence_raw.push_str(" | "); - evidence_raw.push_str(&sex_evidence); - } - evidence_raw -} - -fn sex_inference_evidence_raw(inferred_sex: &SexInference) -> String { - let sex = match inferred_sex.sex { - InferredSex::Male => "male", - InferredSex::Female => "female", - InferredSex::Unknown => "unknown", - }; - let confidence = match inferred_sex.confidence { - SexDetectionConfidence::High => "high", - SexDetectionConfidence::Medium => "medium", - SexDetectionConfidence::Low => "low", - }; - let mut fields = vec![ - format!("detected_sex={sex}"), - format!("sex_confidence={confidence}"), - format!("sex_method={}", inferred_sex.method), - ]; - fields.extend( - inferred_sex - .evidence - .iter() - .map(|item| format!("sex_{item}")), - ); - fields.join(" ") -} - -fn genotype_display_from_raw_counts(raw_counts: &str) -> Option { - let counts: serde_json::Map = - serde_json::from_str(raw_counts).ok()?; - let mut items = counts - .into_iter() - .filter_map(|(base, count)| { - let base = base.chars().next()?.to_ascii_uppercase(); - let count = count.as_u64()?; - if matches!(base, 'A' | 'C' | 'G' | 'T') && count > 0 { - Some((base, count)) - } else { - None - } - }) - .collect::>(); - if items.is_empty() { - return None; - } - items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); - let total = items.iter().map(|(_, count)| *count).sum::(); - let (top_base, top_count) = items[0]; - if total == 0 || items.len() == 1 || top_count.saturating_mul(10) >= total.saturating_mul(8) { - return Some(format!("{top_base}{top_base}")); - } - Some(format!("{}{}", top_base, items[1].0)) -} - #[cfg(test)] mod tests { use super::*; - use bioscript_core::VariantSpec; + use bioscript_core::{VariantKind, VariantSpec}; + use bioscript_formats::{InferredSex, SexDetectionConfidence}; use std::path::PathBuf; #[test] diff --git a/rust/bioscript-reporting/src/observation/genotype_display.rs b/rust/bioscript-reporting/src/observation/genotype_display.rs new file mode 100644 index 0000000..c944d2e --- /dev/null +++ b/rust/bioscript-reporting/src/observation/genotype_display.rs @@ -0,0 +1,207 @@ +use std::collections::BTreeMap; + +use bioscript_core::{Assembly, VariantKind}; +use bioscript_formats::{InferredSex, SexDetectionConfidence, SexInference}; +use bioscript_schema::VariantManifest; + +pub(super) fn assembly_row_value(assembly: Assembly) -> String { + match assembly { + Assembly::Grch37 => "grch37".to_owned(), + Assembly::Grch38 => "grch38".to_owned(), + } +} + +pub(super) fn hemizygous_display_genotype(display: &str) -> String { + display + .chars() + .find(char::is_ascii_alphabetic) + .map_or_else(|| display.to_owned(), |allele| allele.to_string()) +} + +pub(super) fn deletion_copy_number_display( + row: &BTreeMap, + manifest: &VariantManifest, + depth: Option, + alt_count: Option, +) -> Option { + if !matches!(manifest.spec.kind, Some(VariantKind::Deletion)) { + return None; + } + if !matches!(row.get("backend").map(String::as_str), Some("cram" | "bam")) { + return None; + } + if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { + return None; + } + let depth = depth?; + if depth == 0 { + return None; + } + let alt_fraction = f64::from(alt_count.unwrap_or(0)) / f64::from(depth); + if alt_fraction >= 0.8 { + Some("DD".to_owned()) + } else if alt_fraction <= 0.2 { + Some("II".to_owned()) + } else { + Some("DI".to_owned()) + } +} + +pub(super) fn normalize_app_genotype( + display: &str, + ref_allele: &str, + alt_allele: &str, + kind: Option, + chrom: &str, + inferred_sex: Option<&SexInference>, +) -> (String, String) { + if display.is_empty() { + return ("./.".to_owned(), "unknown".to_owned()); + } + if matches!(kind, Some(VariantKind::Deletion)) + && ref_allele.len() != 1 + && display + .chars() + .filter(char::is_ascii_alphabetic) + .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) + { + return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex); + } + let alleles: Vec = display.chars().filter(char::is_ascii_alphabetic).collect(); + if ref_allele.len() != 1 || alt_allele.len() != 1 { + return (display.to_owned(), "unknown".to_owned()); + } + let ref_ch = ref_allele.chars().next().unwrap_or_default(); + let alt_ch = alt_allele.chars().next().unwrap_or_default(); + if alleles.len() == 1 && is_haploid_sex_chromosome(chrom) { + let allele = alleles[0]; + if allele == ref_ch { + return ("0".to_owned(), "hem_ref".to_owned()); + } + if allele == alt_ch { + return ("1".to_owned(), "hem_alt".to_owned()); + } + return (display.to_owned(), "unknown".to_owned()); + } + if alleles.len() != 2 { + return (display.to_owned(), "unknown".to_owned()); + } + if is_confident_male_sex_chromosome(chrom, inferred_sex) && alleles[0] == alleles[1] { + let allele = alleles[0]; + if allele == ref_ch { + return ("0".to_owned(), "hem_ref".to_owned()); + } + if allele == alt_ch { + return ("1".to_owned(), "hem_alt".to_owned()); + } + return (display.to_owned(), "unknown".to_owned()); + } + let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); + let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); + match (ref_count, alt_count) { + (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), + (1, 1) => ("0/1".to_owned(), "het".to_owned()), + (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), + _ => (display.to_owned(), "unknown".to_owned()), + } +} + +fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool { + is_haploid_sex_chromosome(chrom) + && inferred_sex.is_some_and(|sex| { + sex.sex == InferredSex::Male + && matches!( + sex.confidence, + SexDetectionConfidence::High | SexDetectionConfidence::Medium + ) + }) +} + +fn is_haploid_sex_chromosome(chrom: &str) -> bool { + matches!( + chrom + .trim() + .trim_start_matches("chr") + .trim_start_matches("CHR") + .to_ascii_uppercase() + .as_str(), + "X" | "Y" | "23" | "24" + ) +} + +pub(super) fn observation_evidence_raw( + row: &BTreeMap, + chrom: &str, + inferred_sex: Option<&SexInference>, +) -> String { + let mut evidence_raw = row.get("evidence").cloned().unwrap_or_default(); + if !is_haploid_sex_chromosome(chrom) { + return evidence_raw; + } + let Some(inferred_sex) = inferred_sex else { + return evidence_raw; + }; + let sex_evidence = sex_inference_evidence_raw(inferred_sex); + if sex_evidence.is_empty() { + return evidence_raw; + } + if evidence_raw.is_empty() { + evidence_raw = sex_evidence; + } else { + evidence_raw.push_str(" | "); + evidence_raw.push_str(&sex_evidence); + } + evidence_raw +} + +fn sex_inference_evidence_raw(inferred_sex: &SexInference) -> String { + let sex = match inferred_sex.sex { + InferredSex::Male => "male", + InferredSex::Female => "female", + InferredSex::Unknown => "unknown", + }; + let confidence = match inferred_sex.confidence { + SexDetectionConfidence::High => "high", + SexDetectionConfidence::Medium => "medium", + SexDetectionConfidence::Low => "low", + }; + let mut fields = vec![ + format!("detected_sex={sex}"), + format!("sex_confidence={confidence}"), + format!("sex_method={}", inferred_sex.method), + ]; + fields.extend( + inferred_sex + .evidence + .iter() + .map(|item| format!("sex_{item}")), + ); + fields.join(" ") +} + +pub(super) fn genotype_display_from_raw_counts(raw_counts: &str) -> Option { + let counts: serde_json::Map = + serde_json::from_str(raw_counts).ok()?; + let mut items = counts + .into_iter() + .filter_map(|(base, count)| { + let base = base.chars().next()?.to_ascii_uppercase(); + let count = count.as_u64()?; + if matches!(base, 'A' | 'C' | 'G' | 'T') && count > 0 { + Some((base, count)) + } else { + None + } + }) + .collect::>(); + if items.is_empty() { + return None; + } + items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); + let total = items.iter().map(|(_, count)| *count).sum::(); + let (top_base, top_count) = items[0]; + if total == 0 || items.len() == 1 || top_count.saturating_mul(10) >= total.saturating_mul(8) { + return Some(format!("{top_base}{top_base}")); + } + Some(format!("{}{}", top_base, items[1].0)) +} From d8868640fb2e7b5d457ee0a49f203c1de3afe5ca Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Fri, 15 May 2026 14:08:54 +1000 Subject: [PATCH 5/5] linting --- rust/bioscript-cli/src/report_execution.rs | 72 +++++++++++++++------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/rust/bioscript-cli/src/report_execution.rs b/rust/bioscript-cli/src/report_execution.rs index 0c251f3..a9a5ff8 100644 --- a/rust/bioscript-cli/src/report_execution.rs +++ b/rust/bioscript-cli/src/report_execution.rs @@ -162,27 +162,17 @@ fn run_bioscript_analysis_script(input: &BioscriptAnalysisScriptInput<'_>) -> Re let virtual_output_file = format!("/output/results.{output_extension}"); let script_virtual_path = virtual_pipeline_path(input.script_path, "analysis.py"); let manifest_virtual_path = virtual_pipeline_path(input.manifest_path, "manifest.yaml"); - let mut virtual_text_files = BTreeMap::new(); - virtual_text_files.insert(script_virtual_path.clone(), script_text); - virtual_text_files.insert( - manifest_virtual_path.clone(), - fs::read_to_string(input.manifest_path).map_err(|err| { - format!( - "failed to read analysis manifest {}: {err}", - input.manifest_path.display() - ) - })?, - ); - virtual_text_files.insert(virtual_observations_file.clone(), observations_text); - let mut asset_paths = BTreeMap::new(); - for asset in &input.interpretation.assets { - let asset_path = resolve_manifest_path(input.runtime_root, input.manifest_path, &asset.path)?; - let virtual_asset_path = virtual_pipeline_path(&asset_path, &asset.path); - let text = fs::read_to_string(&asset_path) - .map_err(|err| format!("failed to read analysis asset {}: {err}", asset_path.display()))?; - virtual_text_files.insert(virtual_asset_path.clone(), text); - asset_paths.insert(asset.id.clone(), virtual_asset_path); - } + let AnalysisVirtualTextFiles { + text_files: virtual_text_files, + asset_paths, + } = collect_analysis_virtual_text_files( + input, + &script_virtual_path, + script_text, + &manifest_virtual_path, + &virtual_observations_file, + observations_text, + )?; let mut virtual_binary_files = BTreeMap::new(); virtual_binary_files.insert(virtual_input_file.clone(), input_bytes); let context = analysis_context( @@ -239,6 +229,46 @@ fn run_bioscript_analysis_script(input: &BioscriptAnalysisScriptInput<'_>) -> Re persist_virtual_output_files(&written, &virtual_output_file, input.output_file) } +struct AnalysisVirtualTextFiles { + text_files: BTreeMap, + asset_paths: BTreeMap, +} + +fn collect_analysis_virtual_text_files( + input: &BioscriptAnalysisScriptInput<'_>, + script_virtual_path: &str, + script_text: String, + manifest_virtual_path: &str, + virtual_observations_file: &str, + observations_text: String, +) -> Result { + let mut virtual_text_files = BTreeMap::new(); + virtual_text_files.insert(script_virtual_path.to_owned(), script_text); + virtual_text_files.insert( + manifest_virtual_path.to_owned(), + fs::read_to_string(input.manifest_path).map_err(|err| { + format!( + "failed to read analysis manifest {}: {err}", + input.manifest_path.display() + ) + })?, + ); + virtual_text_files.insert(virtual_observations_file.to_owned(), observations_text); + let mut asset_paths = BTreeMap::new(); + for asset in &input.interpretation.assets { + let asset_path = resolve_manifest_path(input.runtime_root, input.manifest_path, &asset.path)?; + let virtual_asset_path = virtual_pipeline_path(&asset_path, &asset.path); + let text = fs::read_to_string(&asset_path) + .map_err(|err| format!("failed to read analysis asset {}: {err}", asset_path.display()))?; + virtual_text_files.insert(virtual_asset_path.clone(), text); + asset_paths.insert(asset.id.clone(), virtual_asset_path); + } + Ok(AnalysisVirtualTextFiles { + text_files: virtual_text_files, + asset_paths, + }) +} + fn persist_virtual_output_files( written: &BTreeMap, primary_virtual_output_file: &str,