In [1]:
%pip install sqlglot





[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Evaluation steps

 

### Model Eval
   - Unions

   - CTEs & Recursive CTEs

   - Time travel syntax

   - Sub-queries

 

### Column Eval
  - Aliases

  - "SELECT *"

  - Calculated/Multi-column fields

  - Window Functions

  - Qualified Column Refs

 

### Other

   - Masking salt key in output

# Models Eval

## Unions

In [2]:
import sqlglot
from sqlglot import expressions as exp

# TODO: confirm that every compiled Snowflake model references its relations
# as fully qualified DATABASE.SCHEMA.TABLE. Until that is confirmed, partially
# qualified (SCHEMA.TABLE) and bare (TABLE) references must be handled too.


# Example UNION query in Snowflake syntax (no quotes): three branches reading
# from three fully qualified tables spread over two databases.
union_query = """
SELECT CUSTOMER_ID, ORDER_DATE, 'online' AS CHANNEL
FROM ECOMMERCE_DB.SALES.ONLINE_ORDERS
WHERE ORDER_DATE >= '2024-01-01'
UNION ALL
SELECT CUSTOMER_ID, PURCHASE_DATE AS ORDER_DATE, 'retail' AS CHANNEL
FROM ECOMMERCE_DB.SALES.RETAIL_SALES
WHERE PURCHASE_DATE >= '2024-01-01'
UNION
SELECT CUST_ID AS CUSTOMER_ID, TRANSACTION_DATE AS ORDER_DATE, 'mobile' AS CHANNEL
FROM MOBILE_APP_DB.TRANSACTIONS.MOBILE_TRANSACTIONS
WHERE TRANSACTION_DATE >= '2024-01-01'
"""

def extract_snowflake_tables(sql_query):
    """Return the sorted, de-duplicated table references found in ``sql_query``.

    The statement is parsed with sqlglot's Snowflake dialect. Every
    ``exp.Table`` node is rendered as ``DATABASE.SCHEMA.TABLE``, falling back
    to ``SCHEMA.TABLE`` or a bare ``TABLE`` when qualifiers are missing.
    No attempt is made here to tell physical tables apart from CTE references.
    """
    tree = sqlglot.parse_one(sql_query, dialect="snowflake")

    def render(table):
        # sqlglot exposes the database as ``catalog`` and the schema as ``db``.
        database, schema = table.catalog, table.db
        if not schema:
            # A catalog without a schema is ignored: only the bare name is used.
            return table.name
        prefix = f"{database}.{schema}" if database else f"{schema}"
        return f"{prefix}.{table.name}"

    return sorted({render(table) for table in tree.find_all(exp.Table)})

# Run the extractor on the UNION example and list one table per line.
tables = extract_snowflake_tables(union_query)
print("Snowflake tables found:", *(f"  - {t}" for t in tables), sep="\n")

Snowflake tables found:
  - ECOMMERCE_DB.SALES.ONLINE_ORDERS
  - ECOMMERCE_DB.SALES.RETAIL_SALES
  - MOBILE_APP_DB.TRANSACTIONS.MOBILE_TRANSACTIONS


In [3]:
# Shared list of SQL fixtures for the table extractor; later cells append to it.
test_queries = [
    # 1. Simple UNION of two fully qualified (db.schema.table) tables
    """
    SELECT id FROM db1.schema1.tableA
    UNION
    SELECT id FROM db2.schema2.tableB
    """,

    # 2. UNION ALL where one branch is an aliased JOIN (schema.table names,
    #    no subquery in this case)
    """
    SELECT u.user_id, o.order_id
    FROM analytics.users u
    JOIN analytics.orders o ON u.user_id = o.user_id
    UNION ALL
    SELECT user_id, NULL
    FROM analytics.inactive_users
    WHERE last_login < '2024-01-01'
    """,

    # 3. UNION / UNION ALL chain whose first branch reads from a CTE
    """
    WITH recent_orders AS (
        SELECT order_id, customer_id
        FROM sales.orders
        WHERE order_date > '2025-01-01'
    )
    SELECT customer_id FROM recent_orders
    UNION
    SELECT customer_id FROM sales.customers
    WHERE signup_date > '2025-01-01'
    UNION ALL
    SELECT customer_id FROM marketing.leads
    WHERE source = 'web'
    """
]

# Report the extracted tables for every fixture, numbered from 1.
for i, q in enumerate(test_queries, 1):
    tables = extract_snowflake_tables(q)
    print(f"\nTest case {i}:", *(f"  - {t}" for t in tables), sep="\n")


Test case 1:
  - db1.schema1.tableA
  - db2.schema2.tableB

Test case 2:
  - analytics.inactive_users
  - analytics.orders
  - analytics.users

Test case 3:
  - marketing.leads
  - recent_orders
  - sales.customers
  - sales.orders


## CTEs and Recursive CTEs

In [4]:
# Test case 4: three CTEs feeding a UNION / UNION ALL chain, plus a NOT IN
# subquery. top_products is defined but never referenced by the final SELECTs.
# Note: this mutates the shared list, so re-running the cell appends a duplicate.
test_queries.append(
    """
    WITH active_customers AS (
        SELECT customer_id
        FROM crm_db.sales.customers
        WHERE status = 'active'
    ),
    recent_orders AS (
        SELECT order_id, customer_id
        FROM crm_db.sales.orders
        WHERE order_date > '2025-01-01'
    ),
    top_products AS (
        SELECT product_id
        FROM crm_db.sales.products
        WHERE rating > 4.5
    )
    SELECT ac.customer_id, ro.order_id
    FROM active_customers ac
    JOIN recent_orders ro ON ac.customer_id = ro.customer_id
    UNION
    SELECT customer_id, NULL
    FROM crm_db.marketing.leads
    WHERE source = 'web'
    UNION ALL
    SELECT NULL, order_id
    FROM recent_orders
    WHERE order_id NOT IN (SELECT order_id FROM crm_db.sales.returns)
    """
)

# Re-run the whole fixture list so the newly appended case shows up in context.
for i, q in enumerate(test_queries, 1):
    tables = extract_snowflake_tables(q)
    print(f"\nTest case {i}:", *(f"  - {t}" for t in tables), sep="\n")


Test case 1:
  - db1.schema1.tableA
  - db2.schema2.tableB

Test case 2:
  - analytics.inactive_users
  - analytics.orders
  - analytics.users

Test case 3:
  - marketing.leads
  - recent_orders
  - sales.customers
  - sales.orders

Test case 4:
  - active_customers
  - crm_db.marketing.leads
  - crm_db.sales.customers
  - crm_db.sales.orders
  - crm_db.sales.products
  - crm_db.sales.returns
  - recent_orders


In [5]:
# Test case 5: chained CTEs (get_dummy_data selects from __dbt__cte__dummy_data)
# using Snowflake `v:FIELD::TYPE` path/cast syntax, with the final SELECT joining
# the second CTE to a schema-qualified table.
# Note: this mutates the shared list, so re-running the cell appends a duplicate.
test_queries.append(
    """
    WITH __dbt__cte__dummy_data AS (
        SELECT
            upper(nullif(v:DUMMY_VER_NAME::STRING,'')) AS dummy_ver_name,
            upper(nullif(v:DUMMY_POP_NAME::STRING,'')) AS dummy_pop_name,
            upper(nullif(v:DUMMY_LEVEL_CD::STRING,'')) AS dummy_level_cd,
            upper(nullif(v:DUMMY_VAR_NAME::STRING,'')) AS dummy_var_name,
            nullif(v:DUMMY_COEF::STRING,'')::NUMBER(8,3) AS dummy_coef
        FROM dummy_schema.dummy_table
    ),
    get_dummy_data AS (
        SELECT
            dummy_ver_name,
            dummy_pop_name,
            dummy_level_cd,
            dummy_var_name,
            dummy_coef
        FROM __dbt__cte__dummy_data
    )
    SELECT
        COALESCE(gd.dummy_ver_name::VARCHAR, '') || '~' || COALESCE(gd.dummy_pop_name::VARCHAR, '') || '~' || COALESCE(gd.dummy_level_cd::VARCHAR, '') || '~' || COALESCE(gd.dummy_var_name::VARCHAR, '') AS dummy_id,
        dd.dummy_key,
        gd.dummy_ver_name,
        gd.dummy_pop_name,
        gd.dummy_level_cd,
        gd.dummy_var_name,
        gd.dummy_coef
    FROM get_dummy_data gd
    INNER JOIN dummy_schema.dummy_dim dd ON gd.dummy_ver_name = dd.dummy_ver_name
    """
)

# List the extractor's findings for each fixture, including the dbt-style case.
for i, q in enumerate(test_queries, 1):
    tables = extract_snowflake_tables(q)
    print(f"\nTest case {i}:", *(f"  - {t}" for t in tables), sep="\n")



Test case 1:
  - db1.schema1.tableA
  - db2.schema2.tableB

Test case 2:
  - analytics.inactive_users
  - analytics.orders
  - analytics.users

Test case 3:
  - marketing.leads
  - recent_orders
  - sales.customers
  - sales.orders

Test case 4:
  - active_customers
  - crm_db.marketing.leads
  - crm_db.sales.customers
  - crm_db.sales.orders
  - crm_db.sales.products
  - crm_db.sales.returns
  - recent_orders

Test case 5:
  - __dbt__cte__dummy_data
  - dummy_schema.dummy_dim
  - dummy_schema.dummy_table
  - get_dummy_data


In [6]:
def extract_snowflake_tables(sql_query):
    """Split the relations referenced by ``sql_query`` into tables and CTEs.

    The statement is parsed with sqlglot's Snowflake dialect. Returns a
    ``(physical_tables, cte_names)`` pair of sorted lists:

    * ``cte_names`` holds the alias of every CTE that has one;
    * ``physical_tables`` holds every table reference, rendered as
      ``DATABASE.SCHEMA.TABLE`` / ``SCHEMA.TABLE`` / ``TABLE``, whose rendered
      name is not one of those CTE aliases.
    """
    tree = sqlglot.parse_one(sql_query, dialect="snowflake")

    # Names introduced by WITH clauses anywhere in the statement.
    cte_names = {cte.alias for cte in tree.find_all(exp.CTE) if cte.alias}

    referenced = set()
    for table in tree.find_all(exp.Table):
        # sqlglot exposes the database as ``catalog`` and the schema as ``db``;
        # a catalog is only used when a schema is present as well.
        if table.db:
            qualifiers = [table.catalog, table.db] if table.catalog else [table.db]
        else:
            qualifiers = []
        referenced.add(".".join([*qualifiers, table.name]))

    # Whatever is referenced but not declared as a CTE is treated as physical.
    return sorted(referenced - cte_names), sorted(cte_names)



# Show physical tables and CTE names side by side for every fixture.
for i, q in enumerate(test_queries, 1):
    physical_tables, cte_names = extract_snowflake_tables(q)
    print(
        f"\nTest case {i}:",
        "  Physical tables:",
        *(f"    - {t}" for t in physical_tables),
        "  CTE names:",
        *(f"    - {c}" for c in cte_names),
        sep="\n",
    )


Test case 1:
  Physical tables:
    - db1.schema1.tableA
    - db2.schema2.tableB
  CTE names:

Test case 2:
  Physical tables:
    - analytics.inactive_users
    - analytics.orders
    - analytics.users
  CTE names:

Test case 3:
  Physical tables:
    - marketing.leads
    - sales.customers
    - sales.orders
  CTE names:
    - recent_orders

Test case 4:
  Physical tables:
    - crm_db.marketing.leads
    - crm_db.sales.customers
    - crm_db.sales.orders
    - crm_db.sales.products
    - crm_db.sales.returns
  CTE names:
    - active_customers
    - recent_orders
    - top_products

Test case 5:
  Physical tables:
    - dummy_schema.dummy_dim
    - dummy_schema.dummy_table
  CTE names:
    - __dbt__cte__dummy_data
    - get_dummy_data


## Time Travel (`AT (TIMESTAMP => ...)`) Example

In [7]:
# Test case 6: same shape as the previous dbt-style query, but the first CTE
# reads its table with an `AT (TIMESTAMP => ...)` clause.
# Note: this mutates the shared list, so re-running the cell appends a duplicate.
test_queries.append(
    """
    WITH __dbt__cte__dummy_data AS (
        SELECT
            upper(nullif(v:DUMMY_VER_NAME::STRING,'')) AS dummy_ver_name,
            upper(nullif(v:DUMMY_POP_NAME::STRING,'')) AS dummy_pop_name,
            upper(nullif(v:DUMMY_LEVEL_CD::STRING,'')) AS dummy_level_cd,
            upper(nullif(v:DUMMY_VAR_NAME::STRING,'')) AS dummy_var_name,
            nullif(v:DUMMY_COEF::STRING,'')::NUMBER(8,3) AS dummy_coef
        FROM dummy_schema.dummy_table AT (TIMESTAMP => '2025-07-31 00:00:00')
    ),
    get_dummy_data AS (
        SELECT
            dummy_ver_name,
            dummy_pop_name,
            dummy_level_cd,
            dummy_var_name,
            dummy_coef
        FROM __dbt__cte__dummy_data
    )
    SELECT
        COALESCE(gd.dummy_ver_name::VARCHAR, '') || '~' || COALESCE(gd.dummy_pop_name::VARCHAR, '') || '~' || COALESCE(gd.dummy_level_cd::VARCHAR, '') || '~' || COALESCE(gd.dummy_var_name::VARCHAR, '') AS dummy_id,
        dd.dummy_key,
        gd.dummy_ver_name,
        gd.dummy_pop_name,
        gd.dummy_level_cd,
        gd.dummy_var_name,
        gd.dummy_coef
    FROM get_dummy_data gd
    INNER JOIN dummy_schema.dummy_dim dd ON gd.dummy_ver_name = dd.dummy_ver_name
    """
)

# Re-run every fixture to check the time-travel clause does not change the result.
for i, q in enumerate(test_queries, 1):
    physical_tables, cte_names = extract_snowflake_tables(q)
    print(
        f"\nTest case {i}:",
        "  Physical tables:",
        *(f"    - {t}" for t in physical_tables),
        "  CTE names:",
        *(f"    - {c}" for c in cte_names),
        sep="\n",
    )



Test case 1:
  Physical tables:
    - db1.schema1.tableA
    - db2.schema2.tableB
  CTE names:

Test case 2:
  Physical tables:
    - analytics.inactive_users
    - analytics.orders
    - analytics.users
  CTE names:

Test case 3:
  Physical tables:
    - marketing.leads
    - sales.customers
    - sales.orders
  CTE names:
    - recent_orders

Test case 4:
  Physical tables:
    - crm_db.marketing.leads
    - crm_db.sales.customers
    - crm_db.sales.orders
    - crm_db.sales.products
    - crm_db.sales.returns
  CTE names:
    - active_customers
    - recent_orders
    - top_products

Test case 5:
  Physical tables:
    - dummy_schema.dummy_dim
    - dummy_schema.dummy_table
  CTE names:
    - __dbt__cte__dummy_data
    - get_dummy_data

Test case 6:
  Physical tables:
    - dummy_schema.dummy_dim
    - dummy_schema.dummy_table
  CTE names:
    - __dbt__cte__dummy_data
    - get_dummy_data


## Derived/Sub-query example

In [None]:
# Test case 7: extends the time-travel query with
#   - a derived-table JOIN (`sub`) that itself contains an IN (SELECT ...) filter, and
#   - a correlated scalar subquery in the outer WHERE (it references gd.dummy_level_cd).
# dummy_schema.dummy_table is read both by the first CTE and by that WHERE subquery.
# Note: this mutates the shared list, so re-running the cell appends a duplicate.
test_queries.append(
    """
    WITH __dbt__cte__dummy_data AS (
        SELECT
            upper(nullif(v:DUMMY_VER_NAME::STRING,'')) AS dummy_ver_name,
            upper(nullif(v:DUMMY_POP_NAME::STRING,'')) AS dummy_pop_name,
            upper(nullif(v:DUMMY_LEVEL_CD::STRING,'')) AS dummy_level_cd,
            upper(nullif(v:DUMMY_VAR_NAME::STRING,'')) AS dummy_var_name,
            nullif(v:DUMMY_COEF::STRING,'')::NUMBER(8,3) AS dummy_coef
        FROM dummy_schema.dummy_table AT (TIMESTAMP => '2025-07-31 00:00:00')
    ),
    get_dummy_data AS (
        SELECT
            dummy_ver_name,
            dummy_pop_name,
            dummy_level_cd,
            dummy_var_name,
            dummy_coef
        FROM __dbt__cte__dummy_data
    )
    SELECT
        COALESCE(gd.dummy_ver_name::VARCHAR, '') || '~' || COALESCE(gd.dummy_pop_name::VARCHAR, '') || '~' || COALESCE(gd.dummy_level_cd::VARCHAR, '') || '~' || COALESCE(gd.dummy_var_name::VARCHAR, '') AS dummy_id,
        dd.dummy_key,
        gd.dummy_ver_name,
        gd.dummy_pop_name,
        gd.dummy_level_cd,
        gd.dummy_var_name,
        gd.dummy_coef,
        sub.latest_status
    FROM get_dummy_data gd
    INNER JOIN (
        SELECT
            dummy_ver_name,
            MAX(status_date) AS latest_status
        FROM dummy_schema.dummy_status
        WHERE status_code IN (
            SELECT code FROM dummy_schema.status_codes WHERE is_active = 1
        )
        GROUP BY dummy_ver_name
    ) sub ON gd.dummy_ver_name = sub.dummy_ver_name
    WHERE gd.dummy_coef > (
        SELECT AVG(dummy_coef) FROM dummy_schema.dummy_table WHERE dummy_level_cd = gd.dummy_level_cd
    )
    """
)



In [13]:
def extract_snowflake_tables(sql_query):
    """Classify the relations referenced by a Snowflake SQL statement.

    The statement is parsed with sqlglot's Snowflake dialect and its table
    references are sorted into several buckets.

    Args:
        sql_query: A single SQL statement as a string.

    Returns:
        A 5-tuple ``(source_target_tables, cte_names, valuable_join_tables,
        where_subquery_tables, table_aliases)``:

        * ``source_target_tables`` - sorted non-CTE tables that are either read
          inside a CTE definition or do not appear in any WHERE subquery.
        * ``cte_names`` - sorted aliases of all CTEs.
        * ``valuable_join_tables`` - sorted non-CTE tables found inside a
          subquery under a JOIN, minus those also found in a WHERE subquery.
        * ``where_subquery_tables`` - sorted non-CTE tables found inside a
          subquery under any WHERE clause (at any nesting depth).
        * ``table_aliases`` - dict mapping each table alias to the rendered
          name of the table (or CTE) it refers to. The dict is flat, so an
          alias reused in several scopes keeps whichever entry was written last.

    Table names are rendered as ``DATABASE.SCHEMA.TABLE``, ``SCHEMA.TABLE`` or
    a bare ``TABLE`` depending on which qualifiers are present.
    """
    parsed = sqlglot.parse_one(sql_query, dialect="snowflake")

    def get_full_name(node):
        """Render a table node with whatever qualifiers it carries."""
        # sqlglot exposes the database as ``catalog`` and the schema as ``db``.
        db = node.catalog or ""
        schema = node.db or ""
        if db and schema:
            return f"{db}.{schema}.{node.name}"
        if schema:
            return f"{schema}.{node.name}"
        return node.name

    # Resolve every CTE name up front so that all later filtering sees the
    # complete set, regardless of the order in which CTEs are visited.
    cte_nodes = list(parsed.find_all(exp.CTE))
    cte_names = {cte.alias for cte in cte_nodes if cte.alias}

    table_aliases = {}

    def collect_tables(root):
        """Return the non-CTE tables under ``root``, recording aliases as a side effect."""
        found = set()
        for node in root.find_all(exp.Table):
            full_name = get_full_name(node)
            if full_name not in cte_names:
                found.add(full_name)
            if node.alias:
                # Aliases of CTE references are recorded too.
                table_aliases[node.alias] = full_name
        return found

    # Tables read inside CTE definitions. Aliases are deliberately not recorded
    # here; the whole-statement pass below is what seeds ``table_aliases``.
    cte_source_tables = {
        get_full_name(node)
        for cte in cte_nodes
        for node in cte.find_all(exp.Table)
    } - cte_names

    # All physical (non-CTE) tables anywhere in the statement.
    all_physical_tables = collect_tables(parsed)

    # Tables inside subqueries that sit under a JOIN (derived tables, but also
    # any subquery used in the join condition).
    # NOTE(review): only ``exp.Subquery`` nodes are inspected here and below;
    # whether every IN / EXISTS form is wrapped in one depends on the sqlglot
    # version - confirm against the version in use.
    join_subquery_tables = set()
    for join in parsed.find_all(exp.Join):
        for subquery in join.find_all(exp.Subquery):
            join_subquery_tables |= collect_tables(subquery)

    # Tables inside subqueries under any WHERE clause, including WHERE clauses
    # nested in CTEs or in JOIN subqueries.
    where_subquery_tables = set()
    for where in parsed.find_all(exp.Where):
        for subquery in where.find_all(exp.Subquery):
            where_subquery_tables |= collect_tables(subquery)

    valuable_join_tables = sorted(join_subquery_tables - where_subquery_tables)

    # Keep a table as source/target if it is read by a CTE definition, or if it
    # never shows up inside a WHERE subquery.
    source_target_tables = sorted(
        t for t in all_physical_tables
        if t in cte_source_tables or t not in where_subquery_tables
    )

    return (
        source_target_tables,
        sorted(cte_names),
        valuable_join_tables,
        sorted(where_subquery_tables),
        table_aliases
    )

# Print the full classification (tables, CTEs, JOIN/WHERE subquery tables and
# aliases) for every fixture collected so far.
for i, q in enumerate(test_queries, 1):
    target_tables, cte_names, join_subquery_tables, where_subquery_tables, table_aliases = extract_snowflake_tables(q)
    print(
        f"\nTest case {i}:",
        "  Source/target tables (all physical tables):",
        *(f"    - {t}" for t in target_tables),
        "  CTE names:",
        *(f"    - {c}" for c in cte_names),
        "  JOIN/derived subquery tables (valuable for lineage):",
        *(f"    - {j}" for j in join_subquery_tables),
        "  WHERE subquery tables (not useful for lineage):",
        *(f"    - {w}" for w in where_subquery_tables),
        "  Table aliases:",
        *(f"    {alias} -> {table}" for alias, table in table_aliases.items()),
        sep="\n",
    )


Test case 1:
  Source/target tables (all physical tables):
    - db1.schema1.tableA
    - db2.schema2.tableB
  CTE names:
  JOIN/derived subquery tables (valuable for lineage):
  WHERE subquery tables (not useful for lineage):
  Table aliases:

Test case 2:
  Source/target tables (all physical tables):
    - analytics.inactive_users
    - analytics.orders
    - analytics.users
  CTE names:
  JOIN/derived subquery tables (valuable for lineage):
  WHERE subquery tables (not useful for lineage):
  Table aliases:
    u -> analytics.users
    o -> analytics.orders

Test case 3:
  Source/target tables (all physical tables):
    - marketing.leads
    - sales.customers
    - sales.orders
  CTE names:
    - recent_orders
  JOIN/derived subquery tables (valuable for lineage):
  WHERE subquery tables (not useful for lineage):
  Table aliases:

Test case 4:
  Source/target tables (all physical tables):
    - crm_db.marketing.leads
    - crm_db.sales.customers
    - crm_db.sales.orders
    - crm_d

# Columns Eval

## Aliases