From eefa096d0cb3933efce55bf2315a4b377715d796 Mon Sep 17 00:00:00 2001 From: Martin Mauch Date: Wed, 26 Nov 2025 23:46:02 +0100 Subject: [PATCH 1/4] fix: Lineage traces through CTEs to source tables Previously, CTEs (let statements) were treated as opaque tables in lineage tracking. The lineage would show `table: [cte_name]` instead of tracing back to the actual source tables. Now, lineage properly traces through CTEs to their underlying source tables. For simple CTEs, this shows the original table. For CTEs with UNIONs or JOINs, all source tables are included in the lineage. --- .../prqlc/src/semantic/resolver/inference.rs | 49 ++++++++++++++++--- prqlc/prqlc/src/semantic/resolver/mod.rs | 49 +++++++++++++++++++ ..._queries__debug_lineage__genre_counts.snap | 6 ++- 3 files changed, 96 insertions(+), 8 deletions(-) diff --git a/prqlc/prqlc/src/semantic/resolver/inference.rs b/prqlc/prqlc/src/semantic/resolver/inference.rs index 336d5ade30b6..9ca68e2711f0 100644 --- a/prqlc/prqlc/src/semantic/resolver/inference.rs +++ b/prqlc/prqlc/src/semantic/resolver/inference.rs @@ -71,17 +71,54 @@ impl Resolver<'_> { input_id: usize, ) -> Lineage { let table_decl = self.root_mod.module.get(table_fq).unwrap(); - let TableDecl { ty, .. } = table_decl.kind.as_table_decl().unwrap(); + let TableDecl { ty, expr } = table_decl.kind.as_table_decl().unwrap(); + + // For CTEs (RelationVar), trace lineage back to the underlying source tables. + // We preserve the underlying inputs' `table` fields (which point to the actual + // source tables like `default_db.employees`) but use the CTE's name for the + // `name` field (used for column references in SQL generation). + // + // For UNIONs and JOINs, this means we include all underlying source tables. + let inputs = if let TableExpr::RelationVar(relation_expr) = expr { + if let Some(underlying_lineage) = &relation_expr.lineage { + if underlying_lineage.inputs.is_empty() { + vec![LineageInput { + id: input_id, + name: input_name.clone(), + table: table_fq.clone(), + }] + } else { + // Trace back to all underlying source tables + underlying_lineage + .inputs + .iter() + .map(|inp| LineageInput { + id: input_id, + name: input_name.clone(), + table: inp.table.clone(), + }) + .collect() + } + } else { + vec![LineageInput { + id: input_id, + name: input_name.clone(), + table: table_fq.clone(), + }] + } + } else { + vec![LineageInput { + id: input_id, + name: input_name.clone(), + table: table_fq.clone(), + }] + }; // TODO: can this panic? let columns = ty.as_ref().unwrap().as_relation().unwrap(); let mut instance_frame = Lineage { - inputs: vec![LineageInput { - id: input_id, - name: input_name.clone(), - table: table_fq.clone(), - }], + inputs, columns: Vec::new(), ..Default::default() }; diff --git a/prqlc/prqlc/src/semantic/resolver/mod.rs b/prqlc/prqlc/src/semantic/resolver/mod.rs index 5c774a567829..0669891f5c9c 100644 --- a/prqlc/prqlc/src/semantic/resolver/mod.rs +++ b/prqlc/prqlc/src/semantic/resolver/mod.rs @@ -481,4 +481,53 @@ pub(super) mod test { _ => panic!("Expected All column"), } } + + #[test] + fn test_cte_lineage_with_union_traces_to_all_source_tables() { + // This test verifies that CTEs with UNIONs trace lineage + // back to ALL underlying source tables. + use crate::internal::pl_to_lineage; + + let query = r#" + let combined = ( + from employees + select {name, dept} + append ( + from contractors + select {name, dept} + ) + ) + from combined + select {name} + "#; + + let pl = crate::prql_to_pl(query).unwrap(); + let fc = pl_to_lineage(pl).unwrap(); + let final_lineage = &fc.frames.last().unwrap().1; + + // Should have inputs from both employees and contractors + assert_eq!( + final_lineage.inputs.len(), + 2, + "CTE with UNION should have 2 inputs, got {:?}", + final_lineage.inputs + ); + + let tables: Vec<_> = final_lineage + .inputs + .iter() + .map(|inp| inp.table.name.as_str()) + .collect(); + + assert!( + tables.contains(&"employees"), + "Should contain employees table, got {:?}", + tables + ); + assert!( + tables.contains(&"contractors"), + "Should contain contractors table, got {:?}", + tables + ); + } } diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__debug_lineage__genre_counts.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__debug_lineage__genre_counts.snap index 8ba9ac7871bd..b7f88e09376a 100644 --- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__debug_lineage__genre_counts.snap +++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__debug_lineage__genre_counts.snap @@ -16,7 +16,8 @@ frames: - id: 133 name: genre_count table: - - genre_count + - default_db + - genres - - 1:217-230 - columns: - !Single @@ -28,7 +29,8 @@ frames: - id: 133 name: genre_count table: - - genre_count + - default_db + - genres nodes: - id: 133 kind: Ident From bf1e03356d7117991f874c0c7bc4a9e3afb9ac9d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Nov 2025 22:48:07 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- prqlc/prqlc/src/semantic/resolver/inference.rs | 2 +- prqlc/prqlc/src/semantic/resolver/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/prqlc/prqlc/src/semantic/resolver/inference.rs b/prqlc/prqlc/src/semantic/resolver/inference.rs index 9ca68e2711f0..1305fecc0d6b 100644 --- a/prqlc/prqlc/src/semantic/resolver/inference.rs +++ b/prqlc/prqlc/src/semantic/resolver/inference.rs @@ -78,7 +78,7 @@ impl Resolver<'_> { // source tables like `default_db.employees`) but use the CTE's name for the // `name` field (used for column references in SQL generation). // - // For UNIONs and JOINs, this means we include all underlying source tables. + // For UNIONNs and JOINs, this means we include all underlying source tables. let inputs = if let TableExpr::RelationVar(relation_expr) = expr { if let Some(underlying_lineage) = &relation_expr.lineage { if underlying_lineage.inputs.is_empty() { diff --git a/prqlc/prqlc/src/semantic/resolver/mod.rs b/prqlc/prqlc/src/semantic/resolver/mod.rs index 0669891f5c9c..b86e9282eb93 100644 --- a/prqlc/prqlc/src/semantic/resolver/mod.rs +++ b/prqlc/prqlc/src/semantic/resolver/mod.rs @@ -484,7 +484,7 @@ pub(super) mod test { #[test] fn test_cte_lineage_with_union_traces_to_all_source_tables() { - // This test verifies that CTEs with UNIONs trace lineage + // This test verifies that CTEs with UNIONNs trace lineage // back to ALL underlying source tables. use crate::internal::pl_to_lineage; From 7e14d30383b17efaef0f2689349cb1419176f01e Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Wed, 26 Nov 2025 21:06:40 -0800 Subject: [PATCH 3/4] refactor: simplify lineage tracing and add missing test - Simplify lineage_of_table_decl from nested if-let to two match expressions - Add UNIO to typos allowlist (prevents UNIONs -> UNIONNs corruption) - Add test_cte_lineage_traces_to_source_table mentioned in PR description Co-Authored-By: Claude --- .typos.toml | 3 ++ .../prqlc/src/semantic/resolver/inference.rs | 48 ++++++------------- prqlc/prqlc/src/semantic/resolver/mod.rs | 35 ++++++++++++++ 3 files changed, 53 insertions(+), 33 deletions(-) diff --git a/.typos.toml b/.typos.toml index 375991d026fd..7927fae6e56a 100644 --- a/.typos.toml +++ b/.typos.toml @@ -22,3 +22,6 @@ wee = "wee" flate = "flate" # distinct_ons is correct (SQL DISTINCT ON) ons = "ons" +# SQL UNIONs - typos incorrectly thinks UNIO should be UNION +# This happens because typos tokenizes UNIONs and finds UNIO +UNIO = "UNIO" diff --git a/prqlc/prqlc/src/semantic/resolver/inference.rs b/prqlc/prqlc/src/semantic/resolver/inference.rs index 1305fecc0d6b..36530f3378c8 100644 --- a/prqlc/prqlc/src/semantic/resolver/inference.rs +++ b/prqlc/prqlc/src/semantic/resolver/inference.rs @@ -74,44 +74,26 @@ impl Resolver<'_> { let TableDecl { ty, expr } = table_decl.kind.as_table_decl().unwrap(); // For CTEs (RelationVar), trace lineage back to the underlying source tables. - // We preserve the underlying inputs' `table` fields (which point to the actual - // source tables like `default_db.employees`) but use the CTE's name for the - // `name` field (used for column references in SQL generation). - // - // For UNIONNs and JOINs, this means we include all underlying source tables. - let inputs = if let TableExpr::RelationVar(relation_expr) = expr { - if let Some(underlying_lineage) = &relation_expr.lineage { - if underlying_lineage.inputs.is_empty() { - vec![LineageInput { - id: input_id, - name: input_name.clone(), - table: table_fq.clone(), - }] - } else { - // Trace back to all underlying source tables - underlying_lineage - .inputs - .iter() - .map(|inp| LineageInput { - id: input_id, - name: input_name.clone(), - table: inp.table.clone(), - }) - .collect() - } - } else { - vec![LineageInput { + // For UNIONs and JOINs, this includes all underlying source tables. + let underlying_inputs = match expr { + TableExpr::RelationVar(rel) => rel.lineage.as_ref().map(|l| &l.inputs), + _ => None, + }; + + let inputs = match underlying_inputs { + Some(inputs) if !inputs.is_empty() => inputs + .iter() + .map(|inp| LineageInput { id: input_id, name: input_name.clone(), - table: table_fq.clone(), - }] - } - } else { - vec![LineageInput { + table: inp.table.clone(), + }) + .collect(), + _ => vec![LineageInput { id: input_id, name: input_name.clone(), table: table_fq.clone(), - }] + }], }; // TODO: can this panic? diff --git a/prqlc/prqlc/src/semantic/resolver/mod.rs b/prqlc/prqlc/src/semantic/resolver/mod.rs index b86e9282eb93..24bb8ca3da55 100644 --- a/prqlc/prqlc/src/semantic/resolver/mod.rs +++ b/prqlc/prqlc/src/semantic/resolver/mod.rs @@ -482,6 +482,41 @@ pub(super) mod test { } } + #[test] + fn test_cte_lineage_traces_to_source_table() { + // This test verifies that simple CTEs trace lineage back to + // the underlying source table instead of showing the CTE name. + use crate::internal::pl_to_lineage; + + let query = r#" + let employees_usa = (from employees | filter country == "USA") + from employees_usa + select {name, salary} + "#; + + let pl = crate::prql_to_pl(query).unwrap(); + let fc = pl_to_lineage(pl).unwrap(); + let final_lineage = &fc.frames.last().unwrap().1; + + assert_eq!( + final_lineage.inputs.len(), + 1, + "Simple CTE should have 1 input, got {:?}", + final_lineage.inputs + ); + + let input = &final_lineage.inputs[0]; + assert_eq!( + input.name, "employees_usa", + "Input name should be the CTE alias" + ); + assert_eq!( + input.table.name, "employees", + "Table should trace back to source table 'employees', got {:?}", + input.table + ); + } + #[test] fn test_cte_lineage_with_union_traces_to_all_source_tables() { // This test verifies that CTEs with UNIONNs trace lineage From b6fd4799623ac25be9da4b7001993e4f47aef13a Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Wed, 26 Nov 2025 21:18:08 -0800 Subject: [PATCH 4/4] test: add coverage for direct table lineage fallback path - Add test_direct_table_lineage_uses_table_itself to exercise the non-CTE code path in lineage_of_table_decl (line 80, 92-96 in inference.rs) - Fix UNIONNs typo in test comment Co-Authored-By: Claude --- prqlc/prqlc/src/semantic/resolver/mod.rs | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/prqlc/prqlc/src/semantic/resolver/mod.rs b/prqlc/prqlc/src/semantic/resolver/mod.rs index 24bb8ca3da55..3453dd7cb13e 100644 --- a/prqlc/prqlc/src/semantic/resolver/mod.rs +++ b/prqlc/prqlc/src/semantic/resolver/mod.rs @@ -517,9 +517,38 @@ pub(super) mod test { ); } + #[test] + fn test_direct_table_lineage_uses_table_itself() { + // This test verifies that direct table references (non-CTEs) + // use the table itself as the lineage input, exercising the + // fallback path in lineage_of_table_decl. + use crate::internal::pl_to_lineage; + + let query = r#" + from employees + select {name, salary} + "#; + + let pl = crate::prql_to_pl(query).unwrap(); + let fc = pl_to_lineage(pl).unwrap(); + let final_lineage = &fc.frames.last().unwrap().1; + + assert_eq!( + final_lineage.inputs.len(), + 1, + "Direct table should have 1 input" + ); + + let input = &final_lineage.inputs[0]; + assert_eq!( + input.table.name, "employees", + "Table should be 'employees' directly" + ); + } + #[test] fn test_cte_lineage_with_union_traces_to_all_source_tables() { - // This test verifies that CTEs with UNIONNs trace lineage + // This test verifies that CTEs with UNIONs trace lineage // back to ALL underlying source tables. use crate::internal::pl_to_lineage;