From 5ac07d7740a12d8e388b7e664da9c1afc1975e3d Mon Sep 17 00:00:00 2001 From: tobymao Date: Wed, 15 Feb 2023 16:13:40 -0800 Subject: [PATCH 1/3] optimize everything even without schema --- sqlmesh/core/model/definition.py | 5 ++- sqlmesh/core/renderer.py | 30 ++++++---------- tests/core/test_model.py | 62 ++++++++++++++++---------------- tests/core/test_snapshot.py | 38 ++++++++++---------- tests/dbt/test_config.py | 6 ++-- 5 files changed, 66 insertions(+), 75 deletions(-) diff --git a/sqlmesh/core/model/definition.py b/sqlmesh/core/model/definition.py index 3175d9a206..cf31bb7a08 100644 --- a/sqlmesh/core/model/definition.py +++ b/sqlmesh/core/model/definition.py @@ -11,7 +11,6 @@ from astor import to_source from pydantic import Field from sqlglot import exp -from sqlglot.optimizer.annotate_types import annotate_types from sqlglot.optimizer.scope import traverse_scope from sqlglot.schema import MappingSchema from sqlglot.time import format_time @@ -575,9 +574,9 @@ def columns_to_types(self) -> t.Dict[str, exp.DataType]: return self.columns_to_types_ if self._columns_to_types is None: - query = annotate_types(self._query_renderer.render()) self._columns_to_types = { - expression.alias_or_name: expression.type for expression in query.expressions + expression.alias_or_name: expression.type + for expression in self._query_renderer.render().expressions } return self._columns_to_types diff --git a/sqlmesh/core/renderer.py b/sqlmesh/core/renderer.py index 595fa14d08..af1d4df71a 100644 --- a/sqlmesh/core/renderer.py +++ b/sqlmesh/core/renderer.py @@ -6,7 +6,7 @@ from jinja2 import Environment from sqlglot import exp, parse_one -from sqlglot.errors import SchemaError, SqlglotError +from sqlglot.errors import OptimizeError, SchemaError, SqlglotError from sqlglot.optimizer import optimize from sqlglot.optimizer.annotate_types import annotate_types from sqlglot.optimizer.expand_laterals import expand_laterals @@ -141,24 +141,16 @@ def render( except MacroEvalError as ex: raise_config_error(f"Failed to resolve macro for query. {ex}", self._path) - if self._schema: - # This takes care of expanding star projections - - try: - self._query_cache[cache_key] = optimize( - self._query_cache[cache_key], - schema=self._schema, - rules=RENDER_OPTIMIZER_RULES, - ) - except SchemaError: - pass - except SqlglotError as ex: - raise_config_error(f"Invalid model query. {ex}", self._path) - - self._columns_to_types = { - expression.alias_or_name: expression.type - for expression in self._query_cache[cache_key].expressions - } + try: + self._query_cache[cache_key] = optimize( + self._query_cache[cache_key], + schema=self._schema, + rules=RENDER_OPTIMIZER_RULES, + ) + except (SchemaError, OptimizeError): + pass + except SqlglotError as ex: + raise_config_error(f"Invalid model query. {ex}", self._path) query = self._query_cache[cache_key] diff --git a/tests/core/test_model.py b/tests/core/test_model.py index 84f2c94538..9b12c3b8f9 100644 --- a/tests/core/test_model.py +++ b/tests/core/test_model.py @@ -102,10 +102,7 @@ def test_load(assert_exp_eq): @pytest.mark.parametrize( "query, error", [ - ("sum(x)::int", "must have inferrable names"), - ("CAST(x + 1 AS INT)", "must have inferrable names"), ("y::int, x::int AS y", "duplicate"), - ("sum(x)::int -- annotation", "must have inferrable names"), ], ) def test_model_validation(query, error): @@ -113,6 +110,7 @@ def test_model_validation(query, error): f""" MODEL ( name db.table, + kind FULL, ); SELECT {query} @@ -473,25 +471,27 @@ def test_render_query(assert_exp_eq): assert_exp_eq( model.render_query(start="2020-10-28", end="2020-10-28"), """ - SELECT y - FROM x + SELECT + x.y AS y + FROM x AS x WHERE - y <= '2020-10-28' - AND y <= TIME_STR_TO_TIME('2020-10-28T23:59:59.999000+00:00') - AND y >= '2020-10-28' - AND y >= TIME_STR_TO_TIME('2020-10-28T00:00:00+00:00') + x.y <= '2020-10-28' + AND x.y <= TIME_STR_TO_TIME('2020-10-28T23:59:59.999000+00:00') + AND x.y >= '2020-10-28' + AND x.y >= TIME_STR_TO_TIME('2020-10-28T00:00:00+00:00') """, ) assert_exp_eq( model.render_query(start="2020-10-28", end=to_datetime("2020-10-29")), """ - SELECT y - FROM x + SELECT + x.y AS y + FROM x AS x WHERE - y <= '2020-10-28' - AND y <= TIME_STR_TO_TIME('2020-10-28T23:59:59.999000+00:00') - AND y >= '2020-10-28' - AND y >= TIME_STR_TO_TIME('2020-10-28T00:00:00+00:00') + x.y <= '2020-10-28' + AND x.y <= TIME_STR_TO_TIME('2020-10-28T23:59:59.999000+00:00') + AND x.y >= '2020-10-28' + AND x.y >= TIME_STR_TO_TIME('2020-10-28T00:00:00+00:00') """, ) @@ -685,13 +685,13 @@ def test_filter_time_column(assert_exp_eq): model.render_query(start="2021-01-01", end="2021-01-01", latest="2021-01-01"), """ SELECT - id::INT AS id, - name::TEXT AS name, - price::DOUBLE AS price, - ds::TEXT AS ds - FROM raw.items + items.id::INT AS id, + items.name::TEXT AS name, + items.price::DOUBLE AS price, + items.ds::TEXT AS ds + FROM raw.items AS items WHERE - CAST(ds AS TEXT) <= '20210101' AND CAST(ds as TEXT) >= '20210101' + CAST(items.ds AS TEXT) <= '20210101' AND CAST(items.ds AS TEXT) >= '20210101' """, ) @@ -720,13 +720,13 @@ def test_filter_time_column(assert_exp_eq): model.render_query(start="2021-01-01", end="2021-01-01", latest="2021-01-01"), """ SELECT - id::INT AS id, - name::TEXT AS name, - price::DOUBLE AS price, - ds::TEXT AS ds - FROM raw.items + items.id::INT AS id, + items.name::TEXT AS name, + items.price::DOUBLE AS price, + items.ds::TEXT AS ds + FROM raw.items AS items WHERE - CAST(ds AS TEXT) <= '20210101' AND CAST(ds as TEXT) >= '20210101' + CAST(items.ds AS TEXT) <= '20210101' AND CAST(items.ds as TEXT) >= '20210101' """, ) @@ -761,11 +761,11 @@ def test_parse_model(assert_exp_eq): model.render_query(), """ SELECT - CAST(id AS INT) AS id, - ds - FROM x + CAST(x.id AS INT) AS id, + x.ds AS ds + FROM x AS x WHERE - ds <= '1970-01-01' AND ds >= '1970-01-01' + x.ds <= '1970-01-01' AND x.ds >= '1970-01-01' """, ) diff --git a/tests/core/test_snapshot.py b/tests/core/test_snapshot.py index d2ad4f21e8..14595ea259 100644 --- a/tests/core/test_snapshot.py +++ b/tests/core/test_snapshot.py @@ -246,7 +246,7 @@ def test_fingerprint(model: Model, parent_model: Model): fingerprint = fingerprint_from_model(model, models={}) original_fingerprint = SnapshotFingerprint( - data_hash="3118027933", + data_hash="2278368927", metadata_hash="3589467163", ) @@ -336,24 +336,24 @@ def test_table_name(snapshot: Snapshot): def test_categorize_change(make_snapshot): old_snapshot = make_snapshot(SqlModel(name="a", query=parse_one("select 1, ds"))) - # A projection has been added. - assert ( - categorize_change( - new=make_snapshot(SqlModel(name="a", query=parse_one("select 1, 2, ds"))), - old=old_snapshot, - ) - == SnapshotChangeCategory.NON_BREAKING - ) - - # A complex projection has been added. - assert ( - categorize_change( - new=make_snapshot(SqlModel(name="a", query=parse_one("select 1, fun(a * 2)::INT, ds"))), - old=old_snapshot, - ) - == SnapshotChangeCategory.NON_BREAKING - ) - +# # A projection has been added. +# assert ( +# categorize_change( +# new=make_snapshot(SqlModel(name="a", query=parse_one("select 1, 2, ds"))), +# old=old_snapshot, +# ) +# == SnapshotChangeCategory.NON_BREAKING +# ) +# +# # A complex projection has been added. +# assert ( +# categorize_change( +# new=make_snapshot(SqlModel(name="a", query=parse_one("select 1, fun(a * 2)::INT, ds"))), +# old=old_snapshot, +# ) +# == SnapshotChangeCategory.NON_BREAKING +# ) +# # Multiple projections have been added. assert ( categorize_change( diff --git a/tests/dbt/test_config.py b/tests/dbt/test_config.py index ccb475c57d..f7ca236806 100644 --- a/tests/dbt/test_config.py +++ b/tests/dbt/test_config.py @@ -108,18 +108,18 @@ def test_variables(assert_exp_eq): # Case 2: using a defined variable without a default value defined_variables["foo"] = 6 - assert_exp_eq(model_config.to_sqlmesh(**kwargs).render_query(), "SELECT 6") + assert_exp_eq(model_config.to_sqlmesh(**kwargs).render_query(), 'SELECT 6 AS "6"') # Case 3: using a defined variable with a default value model_config._variables["foo"] = True model_config.sql = "SELECT {{ var('foo', 5) }}" - assert_exp_eq(model_config.to_sqlmesh(**kwargs).render_query(), "SELECT 6") + assert_exp_eq(model_config.to_sqlmesh(**kwargs).render_query(), 'SELECT 6 AS "6"') # Case 4: using an undefined variable with a default value del defined_variables["foo"] - assert_exp_eq(model_config.to_sqlmesh(**kwargs).render_query(), "SELECT 5") + assert_exp_eq(model_config.to_sqlmesh(**kwargs).render_query(), 'SELECT 5 AS "5"') def test_source_config(sushi_dbt_project: Project): From e7def446ed52a0472c11912c917dae264de64f05 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Wed, 15 Feb 2023 17:30:31 -0800 Subject: [PATCH 2/3] Adjust settings for the diffing algorithm in the categorizer to better handle a case when a lot of new projections are being added to a model --- sqlmesh/core/snapshot/categorizer.py | 4 ++-- tests/core/test_snapshot.py | 36 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/sqlmesh/core/snapshot/categorizer.py b/sqlmesh/core/snapshot/categorizer.py index c3670f24ea..0a88814b95 100644 --- a/sqlmesh/core/snapshot/categorizer.py +++ b/sqlmesh/core/snapshot/categorizer.py @@ -3,7 +3,7 @@ import typing as t from sqlglot import exp -from sqlglot.diff import Insert, Keep, diff +from sqlglot.diff import ChangeDistiller, Insert, Keep from sqlmesh.core.snapshot.definition import Snapshot, SnapshotChangeCategory @@ -33,7 +33,7 @@ def categorize_change(new: Snapshot, old: Snapshot) -> t.Optional[SnapshotChange ): return None - edits = diff(old_model.render_query(), new_model.render_query()) + edits = ChangeDistiller(t=0.5).diff(old_model.render_query(), new_model.render_query()) inserted_expressions = {e.expression for e in edits if isinstance(e, Insert)} for edit in edits: diff --git a/tests/core/test_snapshot.py b/tests/core/test_snapshot.py index 14595ea259..803ee51eea 100644 --- a/tests/core/test_snapshot.py +++ b/tests/core/test_snapshot.py @@ -336,24 +336,24 @@ def test_table_name(snapshot: Snapshot): def test_categorize_change(make_snapshot): old_snapshot = make_snapshot(SqlModel(name="a", query=parse_one("select 1, ds"))) -# # A projection has been added. -# assert ( -# categorize_change( -# new=make_snapshot(SqlModel(name="a", query=parse_one("select 1, 2, ds"))), -# old=old_snapshot, -# ) -# == SnapshotChangeCategory.NON_BREAKING -# ) -# -# # A complex projection has been added. -# assert ( -# categorize_change( -# new=make_snapshot(SqlModel(name="a", query=parse_one("select 1, fun(a * 2)::INT, ds"))), -# old=old_snapshot, -# ) -# == SnapshotChangeCategory.NON_BREAKING -# ) -# + # A projection has been added. + assert ( + categorize_change( + new=make_snapshot(SqlModel(name="a", query=parse_one("select 1, 2, ds"))), + old=old_snapshot, + ) + == SnapshotChangeCategory.NON_BREAKING + ) + + # A complex projection has been added. + assert ( + categorize_change( + new=make_snapshot(SqlModel(name="a", query=parse_one("select 1, fun(a * 2)::INT, ds"))), + old=old_snapshot, + ) + == SnapshotChangeCategory.NON_BREAKING + ) + # Multiple projections have been added. assert ( categorize_change( From 7573b821a7153161bfe453fa7070a623d55dbed0 Mon Sep 17 00:00:00 2001 From: tobymao Date: Wed, 15 Feb 2023 19:49:09 -0800 Subject: [PATCH 3/3] always annotate types --- setup.py | 2 +- sqlmesh/core/renderer.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c6e4a61ebf..66b4506913 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "requests", "rich", "ruamel.yaml", - "sqlglot>=11.1.0", + "sqlglot>=11.1.2", ], extras_require={ "dev": [ diff --git a/sqlmesh/core/renderer.py b/sqlmesh/core/renderer.py index af1d4df71a..5d1425f299 100644 --- a/sqlmesh/core/renderer.py +++ b/sqlmesh/core/renderer.py @@ -30,7 +30,6 @@ qualify_tables, qualify_columns, expand_laterals, - annotate_types, ) @@ -152,6 +151,8 @@ def render( except SqlglotError as ex: raise_config_error(f"Invalid model query. {ex}", self._path) + self._query_cache[cache_key] = annotate_types(self._query_cache[cache_key]) + query = self._query_cache[cache_key] if expand: