From 15aab453c7e1c484253d3c9a9e95700a6d0077c7 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Fri, 15 May 2026 14:41:59 +1000 Subject: [PATCH 1/3] fixing issues with new variant catalogue and paths --- rust/bioscript-formats/src/genotype.rs | 26 +++++++++++++++++++++++--- rust/bioscript-runtime/src/runtime.rs | 19 +++++++++++++++---- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 0e22caf..377f254 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -27,7 +27,10 @@ pub(crate) use delimited::{ COMMENT_PREFIXES, DelimitedColumnIndexes, Delimiter, detect_delimiter, parse_streaming_row, }; use delimited::{RowParser, scan_delimited_variants}; -use io::{detect_source_format, is_bgzf_path, read_lines_from_reader, select_zip_entry}; +use io::{ + detect_source_format, is_bgzf_path, looks_like_vcf_lines, read_lines_from_reader, + select_zip_entry, +}; pub use types::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, }; @@ -119,12 +122,16 @@ impl GenotypeStore { } pub fn from_bytes(name: &str, bytes: &[u8]) -> Result { + // The report pipeline hands us a fixed virtual path (`/input/genotypes`) + // with no extension, so we cannot rely on `name` alone for format + // detection the way `from_file_with_options` can. Sniff the leading + // bytes so a zip/VCF payload is recognised regardless of the name. let lower = name.to_ascii_lowercase(); - if lower.ends_with(".zip") { + if lower.ends_with(".zip") || bytes_look_like_zip(bytes) { return Self::from_zip_bytes(name, bytes); } let reader = BufReader::new(Cursor::new(bytes)); - if lower.ends_with(".vcf") { + if lower.ends_with(".vcf") || bytes_look_like_vcf(bytes) { return Self::from_vcf_reader(reader, name); } Self::from_delimited_reader(GenotypeSourceFormat::Text, reader, name) @@ -489,6 +496,19 @@ fn required_cache_miss(spec: &VariantSpec) -> RuntimeError { )) } +fn bytes_look_like_zip(bytes: &[u8]) -> bool { + bytes.starts_with(b"PK\x03\x04") + || bytes.starts_with(b"PK\x05\x06") + || bytes.starts_with(b"PK\x07\x08") +} + +fn bytes_look_like_vcf(bytes: &[u8]) -> bool { + let prefix = &bytes[..bytes.len().min(8192)]; + let text = String::from_utf8_lossy(prefix); + let lines: Vec = text.lines().map(str::to_owned).collect(); + looks_like_vcf_lines(&lines) +} + #[cfg(test)] mod tests { use super::*; diff --git a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index 69f98a1..01f4a7d 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -291,7 +291,7 @@ impl BioscriptRuntime { fn resolve_user_path(&self, raw_path: &str) -> Result { let path = Path::new(raw_path); - if path.is_absolute() { + if path_is_rooted(path) { if self.uses_virtual_files() { for component in path.components() { match component { @@ -327,7 +327,7 @@ impl BioscriptRuntime { if self.virtual_file_exists(raw_path) { return Ok(path); } - if self.uses_virtual_files() && Path::new(raw_path).is_absolute() { + if self.uses_virtual_files() && path_is_rooted(Path::new(raw_path)) { return Err(RuntimeError::Io(format!( "virtual file does not exist: {raw_path}" ))); @@ -342,7 +342,7 @@ impl BioscriptRuntime { fn resolve_user_write_path(&self, raw_path: &str) -> Result { let path = self.resolve_user_path(raw_path)?; if self.uses_virtual_files() { - if Path::new(raw_path).is_absolute() + if path_is_rooted(Path::new(raw_path)) && !(raw_path.starts_with("/output/") || raw_path.starts_with("/work/")) { return Err(RuntimeError::InvalidArguments(format!( @@ -489,12 +489,23 @@ impl BioscriptRuntime { } } +// Treat any path with a leading root (`/...`) as rooted/absolute. We cannot use +// `Path::is_absolute()` here: on `wasm32-unknown-unknown` (the wasm-pack build +// target) `is_absolute()` is gated to `unix`/`wasi` and always returns `false` +// for `/`-rooted paths, which would route the virtual report paths +// (`/input/...`, `/work/...`, `/output/...`) into the relative-path branch and +// reject their `RootDir` component as escaping the bioscript root. +// `Path::has_root()` parses components and is target-independent. +fn path_is_rooted(path: &Path) -> bool { + path.has_root() +} + fn resolve_optional_loader_path( runtime: &BioscriptRuntime, path: Option, ) -> Result, RuntimeError> { path.map(|path| { - if path.is_absolute() { + if path_is_rooted(&path) { Ok(path) } else { runtime.resolve_user_path(&path.to_string_lossy()) From cda62e3ff7b20795096ab423b3cda7a977f376e5 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Fri, 15 May 2026 15:07:02 +1000 Subject: [PATCH 2/3] fixes --- rust/bioscript-formats/src/genotype.rs | 4 +- rust/bioscript-formats/src/genotype/vcf.rs | 2 +- rust/bioscript-formats/src/lib.rs | 2 +- rust/bioscript-reporting/src/html.rs | 2 +- rust/bioscript-reporting/src/html/helpers.rs | 2 +- rust/bioscript-reporting/src/manifest.rs | 64 +++++++++++++++++++ rust/bioscript-wasm/src/report_api.rs | 7 +- .../src/report_input_inspection.rs | 29 +++++++++ rust/bioscript-wasm/src/report_workspace.rs | 1 - 9 files changed, 105 insertions(+), 8 deletions(-) diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 377f254..3999287 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -36,8 +36,8 @@ pub use types::{ }; use types::{CramBackend, DelimitedBackend, QueryBackend, RsidMapBackend, VcfBackend}; pub use vcf::{ - choose_variant_locus_for_assembly, imputed_reference_observation, observe_vcf_snp_with_reader, - observe_vcf_variant_with_reader, + choose_variant_locus_for_assembly, detect_vcf_assembly, imputed_reference_observation, + observe_vcf_snp_with_reader, observe_vcf_variant_with_reader, }; use vcf::{lookup_indexed_vcf_variants, scan_vcf_variants}; pub(crate) use vcf_tokens::genotype_from_vcf_gt; diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index 599b1d9..33a587c 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -429,6 +429,6 @@ pub(crate) fn extract_vcf_sample_genotype( genotype_from_vcf_gt(sample_gt, reference, &alternate_refs) } -pub(crate) fn detect_vcf_assembly(path: &Path, probe_lines: &[String]) -> Option { +pub fn detect_vcf_assembly(path: &Path, probe_lines: &[String]) -> Option { detect_assembly(&path.to_string_lossy().to_ascii_lowercase(), probe_lines) } diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index fe74db4..a3684ce 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -15,7 +15,7 @@ mod prepare; pub use genotype::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, - choose_variant_locus_for_assembly, imputed_reference_observation, + choose_variant_locus_for_assembly, detect_vcf_assembly, imputed_reference_observation, observe_cram_deletion_with_reader, observe_cram_indel_with_reader, observe_cram_snp_with_reader, observe_vcf_snp_with_reader, observe_vcf_variant_with_reader, }; diff --git a/rust/bioscript-reporting/src/html.rs b/rust/bioscript-reporting/src/html.rs index 29f978c..78f39e3 100644 --- a/rust/bioscript-reporting/src/html.rs +++ b/rust/bioscript-reporting/src/html.rs @@ -23,7 +23,7 @@ pub fn render_app_html_document( reports: &[serde_json::Value], ) -> Result { let mut out = String::from( - r##"BioScript report
"##, + r##"BioScript report
"##, ); out.push_str( r"", diff --git a/rust/bioscript-reporting/src/html/helpers.rs b/rust/bioscript-reporting/src/html/helpers.rs index 109dc88..6001d50 100644 --- a/rust/bioscript-reporting/src/html/helpers.rs +++ b/rust/bioscript-reporting/src/html/helpers.rs @@ -3,7 +3,7 @@ pub(super) fn render_table_start(out: &mut String, table_id: &str, headers: &[&s let escaped_id = html_escape(table_id); let _ = write!( out, - "
" + "
" ); for (index, header) in headers.iter().enumerate() { let _ = write!( diff --git a/rust/bioscript-reporting/src/manifest.rs b/rust/bioscript-reporting/src/manifest.rs index af52e93..b5cbff3 100644 --- a/rust/bioscript-reporting/src/manifest.rs +++ b/rust/bioscript-reporting/src/manifest.rs @@ -1,4 +1,5 @@ use std::{ + collections::BTreeSet, fs, path::{Path, PathBuf}, }; @@ -284,6 +285,7 @@ pub fn collect_variant_manifest_tasks( } } } + dedupe_variant_manifest_tasks(&mut tasks); Ok(tasks) } ReportManifestKind::Assay => { @@ -305,11 +307,17 @@ pub fn collect_variant_manifest_tasks( } } } + dedupe_variant_manifest_tasks(&mut tasks); Ok(tasks) } } } +fn dedupe_variant_manifest_tasks(tasks: &mut Vec) { + let mut seen = BTreeSet::new(); + tasks.retain(|task| seen.insert(task.manifest_path.clone())); +} + fn load_variant_task( workspace: &impl ManifestWorkspace, path: &str, @@ -804,6 +812,62 @@ analyses: assert_eq!(variant_tasks[0].manifest.name, "rs1"); } + #[test] + fn collect_variant_manifest_tasks_dedupes_catalogue_reached_through_panel_and_assay() { + let workspace = MapWorkspace { + files: BTreeMap::from([ + ( + "panel.yaml".to_owned(), + r#" +schema: bioscript:panel:1.0 +version: "1.0" +name: panel +members: + - kind: variant-catalogue + path: catalogue.yaml + - kind: assay + path: assay.yaml +"# + .to_owned(), + ), + ( + "assay.yaml".to_owned(), + r#" +schema: bioscript:assay:1.0 +version: "1.0" +name: assay +members: + - kind: variant-catalogue + path: catalogue.yaml +"# + .to_owned(), + ), + ( + "catalogue.yaml".to_owned(), + r#" +schema: bioscript:variant-catalogue:1.0 +version: "1.0" +name: catalogue +variants: + source: variants.tsv +"# + .to_owned(), + ), + ( + "variants.tsv".to_owned(), + r#"id name rsid gene ref alt kind grch38_chrom grch38_pos +rs1 rs1 rs1 GENE A G snp 1 123 +"# + .to_owned(), + ), + ]), + }; + + let tasks = collect_variant_manifest_tasks(&workspace, "panel.yaml", &[]).unwrap(); + assert_eq!(tasks.len(), 1); + assert_eq!(tasks[0].manifest_path, "catalogue.yaml#rs1"); + } + #[test] fn manifest_context_and_findings_follow_includes_members_and_inherited_bindings() { let workspace = MapWorkspace { diff --git a/rust/bioscript-wasm/src/report_api.rs b/rust/bioscript-wasm/src/report_api.rs index c0d1299..7337b11 100644 --- a/rust/bioscript-wasm/src/report_api.rs +++ b/rust/bioscript-wasm/src/report_api.rs @@ -28,7 +28,8 @@ mod report_workspace; use report_helpers::*; use report_input_inspection::{ - explicit_sex_from_options, inspect_head_via_js_reader, vcf_sex_via_js_reader, + detect_vcf_assembly_via_js_reader, explicit_sex_from_options, inspect_head_via_js_reader, + vcf_sex_via_js_reader, }; use report_lookup::{BamReportLookup, CramReportLookup, VcfReportLookup}; use report_workspace::PackageWorkspace; @@ -388,6 +389,10 @@ pub fn run_package_report_from_vcf( &options.inspect_options(false), false, ); + if head_inspection.assembly.is_none() { + head_inspection.assembly = + detect_vcf_assembly_via_js_reader(vcf_read_at.clone(), vcf_len as u64, input_name); + } let tabix_index = bioscript_formats::alignment::parse_tbi_bytes(tbi_bytes) .map_err(|err| JsError::new(&format!("parse tbi: {err:?}")))?; let vcf_reader = JsReader::new(vcf_read_at.clone(), vcf_len as u64, "vcf"); diff --git a/rust/bioscript-wasm/src/report_input_inspection.rs b/rust/bioscript-wasm/src/report_input_inspection.rs index 3dfc5d8..a646ace 100644 --- a/rust/bioscript-wasm/src/report_input_inspection.rs +++ b/rust/bioscript-wasm/src/report_input_inspection.rs @@ -54,6 +54,35 @@ pub(super) fn inspect_head_via_js_reader( } } +pub(super) fn detect_vcf_assembly_via_js_reader( + read_at: js_sys::Function, + total_len: u64, + input_name: &str, +) -> Option { + use crate::js_reader::JsReader; + use std::{ + io::{BufRead, BufReader}, + path::Path, + }; + + let reader = JsReader::new(read_at, total_len, "vcf-assembly"); + let bgzf_reader = noodles::bgzf::io::Reader::new(reader); + let mut lines = BufReader::new(bgzf_reader).lines(); + let mut probe_lines = Vec::new(); + for _ in 0..256 { + let line = match lines.next() { + Some(Ok(line)) => line, + Some(Err(_)) | None => break, + }; + let reached_samples = line.starts_with("#CHROM"); + probe_lines.push(line); + if reached_samples { + break; + } + } + bioscript_formats::detect_vcf_assembly(Path::new(input_name), &probe_lines) +} + /// Build the `SexInference` the CLI produces when `--sample-sex` is passed, /// without dragging in the bioscript-cli crate. Mirrors /// `bioscript_cli::report_options::explicit_sample_sex_inference`. diff --git a/rust/bioscript-wasm/src/report_workspace.rs b/rust/bioscript-wasm/src/report_workspace.rs index e3821c3..c5cde93 100644 --- a/rust/bioscript-wasm/src/report_workspace.rs +++ b/rust/bioscript-wasm/src/report_workspace.rs @@ -40,7 +40,6 @@ impl PackageWorkspace { load_variant_manifest_text(path, self.text(path)?) .map_err(|err| JsError::new(&format!("load variant {path}: {err}"))) } - } impl bioscript_reporting::ManifestWorkspace for PackageWorkspace { From 5b69fecbdaf39fadcd984d00e1a8263c47608b19 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Fri, 15 May 2026 15:15:13 +1000 Subject: [PATCH 3/3] fixing linting --- rust/bioscript-formats/src/genotype.rs | 66 +------- rust/bioscript-formats/src/genotype/cache.rs | 65 ++++++++ rust/bioscript-reporting/src/html.rs | 2 +- rust/bioscript-runtime/src/runtime.rs | 144 +----------------- rust/bioscript-runtime/src/runtime/paths.rs | 150 +++++++++++++++++++ 5 files changed, 221 insertions(+), 206 deletions(-) create mode 100644 rust/bioscript-formats/src/genotype/cache.rs create mode 100644 rust/bioscript-runtime/src/runtime/paths.rs diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 3999287..8f4eb16 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -10,6 +10,7 @@ use zip::ZipArchive; use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; mod backends; +mod cache; mod common; mod cram_backend; mod delimited; @@ -19,6 +20,7 @@ mod types; mod vcf; mod vcf_tokens; +pub(crate) use cache::{match_cached_observation, required_cache_miss}; pub(crate) use common::{describe_query, normalize_genotype, variant_sort_key}; pub use cram_backend::{ observe_cram_deletion_with_reader, observe_cram_indel_with_reader, observe_cram_snp_with_reader, @@ -432,70 +434,6 @@ impl GenotypeStore { } } -/// Match a `VariantSpec` against a pre-resolved observation list. Tries rsid -/// equality first (most common case for `PGx` panels), then falls back to a -/// chrom+pos+ref+alt match against either `GRCh37` or `GRCh38` loci so cached -/// observations from a CRAM lookup (which may have been done on one assembly) -/// can satisfy a script that supplies the spec on the other. -fn match_cached_observation<'a>( - observations: &'a [VariantObservation], - spec: &VariantSpec, -) -> Option<&'a VariantObservation> { - if let Some(matched) = observations.iter().find(|obs| { - obs.matched_rsid - .as_deref() - .is_some_and(|rsid| spec.rsids.iter().any(|target| target == rsid)) - }) { - return Some(matched); - } - let assembly_loci = [spec.grch37.as_ref(), spec.grch38.as_ref()] - .into_iter() - .flatten() - .collect::>(); - let target_ref = spec.reference.as_deref(); - let target_alt = spec.alternate.as_deref(); - observations.iter().find(|obs| { - let evidence_match = assembly_loci.iter().any(|loci| { - obs.evidence - .iter() - .any(|line| line.contains(&loci.chrom) && line.contains(&loci.start.to_string())) - }); - if !evidence_match { - return false; - } - match (target_ref, target_alt) { - (Some(r), Some(a)) => obs - .evidence - .iter() - .any(|line| line.contains(r) && line.contains(a)), - _ => true, - } - }) -} - -fn required_cache_miss(spec: &VariantSpec) -> RuntimeError { - let rsids = if spec.rsids.is_empty() { - "".to_owned() - } else { - spec.rsids.join("|") - }; - let loci = [ - spec.grch37 - .as_ref() - .map(|locus| format!("grch37:{}:{}-{}", locus.chrom, locus.start, locus.end)), - spec.grch38 - .as_ref() - .map(|locus| format!("grch38:{}:{}-{}", locus.chrom, locus.start, locus.end)), - ] - .into_iter() - .flatten() - .collect::>() - .join(","); - RuntimeError::InvalidArguments(format!( - "required preloaded genotype observation missing for rsids={rsids} loci={loci}" - )) -} - fn bytes_look_like_zip(bytes: &[u8]) -> bool { bytes.starts_with(b"PK\x03\x04") || bytes.starts_with(b"PK\x05\x06") diff --git a/rust/bioscript-formats/src/genotype/cache.rs b/rust/bioscript-formats/src/genotype/cache.rs new file mode 100644 index 0000000..e133b29 --- /dev/null +++ b/rust/bioscript-formats/src/genotype/cache.rs @@ -0,0 +1,65 @@ +use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; + +/// Match a `VariantSpec` against a pre-resolved observation list. Tries rsid +/// equality first (most common case for `PGx` panels), then falls back to a +/// chrom+pos+ref+alt match against either `GRCh37` or `GRCh38` loci so cached +/// observations from a CRAM lookup (which may have been done on one assembly) +/// can satisfy a script that supplies the spec on the other. +pub(crate) fn match_cached_observation<'a>( + observations: &'a [VariantObservation], + spec: &VariantSpec, +) -> Option<&'a VariantObservation> { + if let Some(matched) = observations.iter().find(|obs| { + obs.matched_rsid + .as_deref() + .is_some_and(|rsid| spec.rsids.iter().any(|target| target == rsid)) + }) { + return Some(matched); + } + let assembly_loci = [spec.grch37.as_ref(), spec.grch38.as_ref()] + .into_iter() + .flatten() + .collect::>(); + let target_ref = spec.reference.as_deref(); + let target_alt = spec.alternate.as_deref(); + observations.iter().find(|obs| { + let evidence_match = assembly_loci.iter().any(|loci| { + obs.evidence + .iter() + .any(|line| line.contains(&loci.chrom) && line.contains(&loci.start.to_string())) + }); + if !evidence_match { + return false; + } + match (target_ref, target_alt) { + (Some(r), Some(a)) => obs + .evidence + .iter() + .any(|line| line.contains(r) && line.contains(a)), + _ => true, + } + }) +} + +pub(crate) fn required_cache_miss(spec: &VariantSpec) -> RuntimeError { + let rsids = if spec.rsids.is_empty() { + "".to_owned() + } else { + spec.rsids.join("|") + }; + let loci = [ + spec.grch37 + .as_ref() + .map(|locus| format!("grch37:{}:{}-{}", locus.chrom, locus.start, locus.end)), + spec.grch38 + .as_ref() + .map(|locus| format!("grch38:{}:{}-{}", locus.chrom, locus.start, locus.end)), + ] + .into_iter() + .flatten() + .collect::>() + .join(","); + RuntimeError::InvalidArguments(format!( + "required preloaded genotype observation missing for rsids={rsids} loci={loci}" + )) +} diff --git a/rust/bioscript-reporting/src/html.rs b/rust/bioscript-reporting/src/html.rs index 78f39e3..5328c8a 100644 --- a/rust/bioscript-reporting/src/html.rs +++ b/rust/bioscript-reporting/src/html.rs @@ -23,7 +23,7 @@ pub fn render_app_html_document( reports: &[serde_json::Value], ) -> Result { let mut out = String::from( - r##"BioScript report
"##, + r#"BioScript report
"#, ); out.push_str( r"", diff --git a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index 01f4a7d..a7c6d09 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -1,7 +1,7 @@ use std::{ collections::BTreeMap, fs, - path::{Component, Path, PathBuf}, + path::{Path, PathBuf}, sync::Arc, time::Duration, }; @@ -13,6 +13,7 @@ mod args; mod host_io; mod methods; mod objects; +mod paths; mod state; mod timing; mod trace; @@ -26,6 +27,7 @@ use objects::bioscript_object; use objects::{ genotype_file_object, variant_object, variant_observation_object, variant_plan_object, }; +pub(crate) use paths::resolve_optional_loader_path; pub use state::{RuntimeConfig, StageTiming}; use state::{RuntimeState, monty_error}; use timing::RuntimeInstant; @@ -289,88 +291,6 @@ impl BioscriptRuntime { }); } - fn resolve_user_path(&self, raw_path: &str) -> Result { - let path = Path::new(raw_path); - if path_is_rooted(path) { - if self.uses_virtual_files() { - for component in path.components() { - match component { - Component::ParentDir | Component::Prefix(_) => { - return Err(RuntimeError::InvalidArguments(format!( - "path escapes bioscript root: {raw_path}" - ))); - } - Component::RootDir | Component::CurDir | Component::Normal(_) => {} - } - } - return Ok(path.to_path_buf()); - } - return Err(RuntimeError::InvalidArguments(format!( - "absolute paths are not allowed: {raw_path}" - ))); - } - for component in path.components() { - match component { - Component::ParentDir | Component::RootDir | Component::Prefix(_) => { - return Err(RuntimeError::InvalidArguments(format!( - "path escapes bioscript root: {raw_path}" - ))); - } - Component::CurDir | Component::Normal(_) => {} - } - } - Ok(self.root.join(path)) - } - - fn resolve_existing_user_path(&self, raw_path: &str) -> Result { - let path = self.resolve_user_path(raw_path)?; - if self.virtual_file_exists(raw_path) { - return Ok(path); - } - if self.uses_virtual_files() && path_is_rooted(Path::new(raw_path)) { - return Err(RuntimeError::Io(format!( - "virtual file does not exist: {raw_path}" - ))); - } - let canonical = path.canonicalize().map_err(|err| { - RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) - })?; - self.ensure_under_root(&canonical, raw_path)?; - Ok(canonical) - } - - fn resolve_user_write_path(&self, raw_path: &str) -> Result { - let path = self.resolve_user_path(raw_path)?; - if self.uses_virtual_files() { - if path_is_rooted(Path::new(raw_path)) - && !(raw_path.starts_with("/output/") || raw_path.starts_with("/work/")) - { - return Err(RuntimeError::InvalidArguments(format!( - "virtual write path must be under /work or /output: {raw_path}" - ))); - } - return Ok(path); - } - if path.exists() { - let canonical = path.canonicalize().map_err(|err| { - RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) - })?; - self.ensure_under_root(&canonical, raw_path)?; - return Ok(canonical); - } - - let parent = path.parent().unwrap_or(&self.root); - let existing_parent = deepest_existing_ancestor(parent); - let canonical_parent = existing_parent.canonicalize().map_err(|err| { - RuntimeError::Io(format!( - "failed to resolve parent dir {}: {err}", - existing_parent.display() - )) - })?; - self.ensure_under_root(&canonical_parent, raw_path)?; - Ok(path) - } - #[must_use] pub fn virtual_written_text_files(&self) -> BTreeMap { self.state @@ -414,39 +334,6 @@ impl BioscriptRuntime { self.config.virtual_binary_files.get(&key).cloned() } - fn uses_virtual_files(&self) -> bool { - !self.config.virtual_text_files.is_empty() || !self.config.virtual_binary_files.is_empty() - } - - fn virtual_file_exists(&self, raw_path: &str) -> bool { - self.config.virtual_text_files.contains_key(raw_path) - || self.config.virtual_binary_files.contains_key(raw_path) - || self - .state - .virtual_written_text_files - .lock() - .expect("virtual file mutex poisoned") - .contains_key(raw_path) - } - - fn virtual_key(&self, path: &Path) -> String { - path.strip_prefix(&self.root) - .unwrap_or(path) - .display() - .to_string() - .replace('\\', "/") - } - - fn ensure_under_root(&self, path: &Path, raw_path: &str) -> Result<(), RuntimeError> { - if path.starts_with(&self.root) { - Ok(()) - } else { - Err(RuntimeError::InvalidArguments(format!( - "path escapes bioscript root: {raw_path}" - ))) - } - } - fn write_trace_report( &self, report_path: &Path, @@ -489,31 +376,6 @@ impl BioscriptRuntime { } } -// Treat any path with a leading root (`/...`) as rooted/absolute. We cannot use -// `Path::is_absolute()` here: on `wasm32-unknown-unknown` (the wasm-pack build -// target) `is_absolute()` is gated to `unix`/`wasi` and always returns `false` -// for `/`-rooted paths, which would route the virtual report paths -// (`/input/...`, `/work/...`, `/output/...`) into the relative-path branch and -// reject their `RootDir` component as escaping the bioscript root. -// `Path::has_root()` parses components and is target-independent. -fn path_is_rooted(path: &Path) -> bool { - path.has_root() -} - -fn resolve_optional_loader_path( - runtime: &BioscriptRuntime, - path: Option, -) -> Result, RuntimeError> { - path.map(|path| { - if path_is_rooted(&path) { - Ok(path) - } else { - runtime.resolve_user_path(&path.to_string_lossy()) - } - }) - .transpose() -} - #[cfg(test)] mod tests { use super::*; diff --git a/rust/bioscript-runtime/src/runtime/paths.rs b/rust/bioscript-runtime/src/runtime/paths.rs new file mode 100644 index 0000000..031dd94 --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/paths.rs @@ -0,0 +1,150 @@ +use std::path::{Component, Path, PathBuf}; + +use bioscript_core::RuntimeError; + +use super::{BioscriptRuntime, deepest_existing_ancestor}; + +impl BioscriptRuntime { + pub(super) fn resolve_user_path(&self, raw_path: &str) -> Result { + let path = Path::new(raw_path); + if path_is_rooted(path) { + if self.uses_virtual_files() { + for component in path.components() { + match component { + Component::ParentDir | Component::Prefix(_) => { + return Err(RuntimeError::InvalidArguments(format!( + "path escapes bioscript root: {raw_path}" + ))); + } + Component::RootDir | Component::CurDir | Component::Normal(_) => {} + } + } + return Ok(path.to_path_buf()); + } + return Err(RuntimeError::InvalidArguments(format!( + "absolute paths are not allowed: {raw_path}" + ))); + } + for component in path.components() { + match component { + Component::ParentDir | Component::RootDir | Component::Prefix(_) => { + return Err(RuntimeError::InvalidArguments(format!( + "path escapes bioscript root: {raw_path}" + ))); + } + Component::CurDir | Component::Normal(_) => {} + } + } + Ok(self.root.join(path)) + } + + pub(super) fn resolve_existing_user_path( + &self, + raw_path: &str, + ) -> Result { + let path = self.resolve_user_path(raw_path)?; + if self.virtual_file_exists(raw_path) { + return Ok(path); + } + if self.uses_virtual_files() && path_is_rooted(Path::new(raw_path)) { + return Err(RuntimeError::Io(format!( + "virtual file does not exist: {raw_path}" + ))); + } + let canonical = path.canonicalize().map_err(|err| { + RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) + })?; + self.ensure_under_root(&canonical, raw_path)?; + Ok(canonical) + } + + pub(super) fn resolve_user_write_path(&self, raw_path: &str) -> Result { + let path = self.resolve_user_path(raw_path)?; + if self.uses_virtual_files() { + if path_is_rooted(Path::new(raw_path)) + && !(raw_path.starts_with("/output/") || raw_path.starts_with("/work/")) + { + return Err(RuntimeError::InvalidArguments(format!( + "virtual write path must be under /work or /output: {raw_path}" + ))); + } + return Ok(path); + } + if path.exists() { + let canonical = path.canonicalize().map_err(|err| { + RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) + })?; + self.ensure_under_root(&canonical, raw_path)?; + return Ok(canonical); + } + + let parent = path.parent().unwrap_or(&self.root); + let existing_parent = deepest_existing_ancestor(parent); + let canonical_parent = existing_parent.canonicalize().map_err(|err| { + RuntimeError::Io(format!( + "failed to resolve parent dir {}: {err}", + existing_parent.display() + )) + })?; + self.ensure_under_root(&canonical_parent, raw_path)?; + Ok(path) + } + + pub(super) fn uses_virtual_files(&self) -> bool { + !self.config.virtual_text_files.is_empty() || !self.config.virtual_binary_files.is_empty() + } + + fn virtual_file_exists(&self, raw_path: &str) -> bool { + self.config.virtual_text_files.contains_key(raw_path) + || self.config.virtual_binary_files.contains_key(raw_path) + || self + .state + .virtual_written_text_files + .lock() + .expect("virtual file mutex poisoned") + .contains_key(raw_path) + } + + pub(super) fn virtual_key(&self, path: &Path) -> String { + path.strip_prefix(&self.root) + .unwrap_or(path) + .display() + .to_string() + .replace('\\', "/") + } + + fn ensure_under_root(&self, path: &Path, raw_path: &str) -> Result<(), RuntimeError> { + if path.starts_with(&self.root) { + Ok(()) + } else { + Err(RuntimeError::InvalidArguments(format!( + "path escapes bioscript root: {raw_path}" + ))) + } + } +} + +// Treat any path with a leading root (`/...`) as rooted/absolute. We cannot use +// `Path::is_absolute()` here: on `wasm32-unknown-unknown` (the wasm-pack build +// target) `is_absolute()` is gated to `unix`/`wasi` and always returns `false` +// for `/`-rooted paths, which would route the virtual report paths +// (`/input/...`, `/work/...`, `/output/...`) into the relative-path branch and +// reject their `RootDir` component as escaping the bioscript root. +// `Path::has_root()` parses components and is target-independent. +fn path_is_rooted(path: &Path) -> bool { + path.has_root() +} + +pub(crate) fn resolve_optional_loader_path( + runtime: &BioscriptRuntime, + path: Option, +) -> Result, RuntimeError> { + path.map(|path| { + if path_is_rooted(&path) { + Ok(path) + } else { + runtime.resolve_user_path(&path.to_string_lossy()) + } + }) + .transpose() +}