In [1]:
import ipywidgets as widgets
from IPython.display import display

import re
import pandas as pd

In [2]:
widget_user = widgets.Text(
    value='testuser',
    placeholder='Type something',
    description='user: ',
    disabled=False,
    style={'description_width': '100px'}    
)

widget_git_org = widgets.Text(
    value='Nike-Inc',
    placeholder='Type something',
    description='git_org ',
    disabled=False,
    style={'description_width': '100px'}    
)

widget_catalog = widgets.Text(
    value='spark_catalog',
    placeholder='Type something',
    description='catalog:',
    disabled=False,
    style={'description_width': '100px'}    
)

widget_schema = widgets.Text(
    value='default',
    placeholder='Type something',
    description='schema:',
    disabled=False,
    style={'description_width': '100px'}
)

widget_library_source = widgets.Combobox(
    placeholder='Choose source',
    options=['pypi', 'git'],
    description='library_source:',
    ensure_option=True,
    value='git',
    disabled=False,
    style={'description_width': '100px'}
)

widget_git_branch_or_commit = widgets.Text(
    value='main',
    placeholder='Type branch name or commit hash',
    description='git_branch_or_commit:',
    disabled=False,
    style={'description_width': '150px'}
)

widget_override_version = widgets.Checkbox(
    value=False,
    description='Override SE version',
    disabled=False,
    style={'description_width': '30px'}
    
)

hbox = widgets.HBox([
    widget_user,
    widget_catalog, 
    widget_schema,
    widget_override_version, 
    widget_library_source, 
    widget_git_org,
    widget_git_branch_or_commit
])

In [3]:
# Display widgets
display(hbox)

HBox(children=(Text(value='testuser', description='user: ', placeholder='Type something', style=TextStyle(desc…

In [4]:
user = re.sub(r'[^a-zA-Z]', '', widget_user.value).lower()
catalog = widget_catalog.value
schema = widget_schema.value
override_se_version = widget_override_version.value
library = widget_library_source.value
org = widget_git_org.value
branch_or_commit = widget_git_branch_or_commit.value

print(user)
print(catalog)
print(schema)
print(override_se_version)
print(library)
print(org)
print(branch_or_commit)

testuser
spark_catalog
default
False
git
Nike-Inc
main


In [5]:
CONFIG = {
    "owner": user,
    "catalog": catalog,
    "schema": schema,
    "user": user,
    "product_id": f"se_{user}_product",
    "in_memory_source": f"se_{user}_source",
    "rules_table": f"{catalog}.{schema}.se_{user}_rules",
    "stats_table": f"{catalog}.{schema}.se_{user}_stats",
    "target_table": f"{catalog}.{schema}.se_{user}_target",
    "override_se_version" : override_se_version,
    "library": library,
    "org": org,
    "branch_or_commit": branch_or_commit
}

config_df = pd.DataFrame(list(CONFIG.items()), columns=['Key', 'Value'])
display(config_df)

Unnamed: 0,Key,Value
0,owner,testuser
1,catalog,spark_catalog
2,schema,default
3,user,testuser
4,product_id,se_testuser_product
5,in_memory_source,se_testuser_source
6,rules_table,spark_catalog.default.se_testuser_rules
7,stats_table,spark_catalog.default.se_testuser_stats
8,target_table,spark_catalog.default.se_testuser_target
9,override_se_version,False


In [6]:
# Display Spark Expectations installed version
from importlib.metadata import version
print(f"---- Current SparkExpectation Version: {version('spark-expectations')}")

---- Current SparkExpectation Version: 2.5.1.dev9+gafe0b380c.d20250909


### Setting up spark expectations

In [7]:
# CREATE SPARK SESSION AND DATABASE
from pyspark.sql import SparkSession

# Create or get a Spark session
spark = SparkSession.builder \
    .appName("Spark Aggregation Rules") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") \
    .getOrCreate()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/spark/.ivy2/cache
The jars for the packages stored in: /home/spark/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-04b74fe2-74aa-4ca3-96c6-9a17db094df6;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 63ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.0.0 from central in [default]
	io.delta#delta-storage;3.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |

In [8]:
databases_df = spark.sql("SHOW DATABASES")
databases_df.show(truncate=False)

tables_df = spark.sql("SHOW TABLES")
tables_df.show(truncate=False)


+---------+
|namespace|
+---------+
|default  |
+---------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [9]:
db_name = f"{CONFIG['catalog']}.{CONFIG['schema']}"
pattern = f"se_{CONFIG['user']}*"

# Set the current catalog
spark.sql(f"USE {CONFIG['catalog']}")

# Drop tables matching pattern
tables_df = spark.sql(f"SHOW TABLES IN {db_name} LIKE '{pattern}'")
tables_to_drop = [row for row in tables_df.collect() if not row["isTemporary"] ]

if tables_to_drop:
    print(f"Found {len(tables_to_drop)} tables to drop.")
    for row in tables_to_drop:
        table_name = row["tableName"]
        spark.sql(f"DROP TABLE IF EXISTS {db_name}.{table_name}")
        print(f"Dropped table: {db_name}.{table_name}")
else:
    print("----- No tables to drop")

----- No tables to drop


In [10]:
views_df = spark.sql(f"SHOW VIEWS in {db_name} LIKE '{pattern}'")
views_to_drop = views_df.collect()

if views_to_drop:
    print(f"Found {len(views_to_drop)} views to drop.")
    for row in views_to_drop:
        view_name = row["viewName"]
        spark.sql(f"DROP VIEW IF EXISTS {view_name}")
        print(f"Dropped view: {view_name}")
else:
    print("----- No views to drop")

----- No views to drop


### Spark expectation execution

In [11]:
import pandas as pd
from pyspark.sql import SparkSession

In [None]:
rules_data = [
    {
        "product_id": CONFIG["product_id"],
        "table_name": CONFIG["target_table"],
        "rule_type": "agg_dq",
        "rule": "data_existing",
        "column_name": "sales",
        "expectation": "count(*) > 0",
        "action_if_failed": "fail",
        "tag": "completeness",
        "description": "Data should be present",
        "enable_for_source_dq_validation": True,
        "enable_for_target_dq_validation": True,
        "is_active": True,
        "enable_error_drop_alert": False,
        "error_drop_threshold": 0,
    },
    {
        "product_id": CONFIG["product_id"],
        "table_name": CONFIG["target_table"],
        "rule_type": "agg_dq",
        "rule": "min_sales",
        "column_name": "sales",
        "expectation": "min(sales)>1000",
        "action_if_failed": "warn",
        "tag": "validity",
        "description": "Minimum sales should be greater than 1000",
        "enable_for_source_dq_validation": True,
        "enable_for_target_dq_validation": True,
        "is_active": True,
        "enable_error_drop_alert": False,
        "error_drop_threshold": 0,
    },
    {
        "product_id": CONFIG["product_id"],
        "table_name": CONFIG["target_table"],
        "rule_type": "agg_dq",
        "rule": "no_duplicates",
        "column_name": "id",
        "expectation": "count(distinct id) == count(*)",
        "action_if_failed": "fail",
        "tag": "uniqueness",
        "description": "Each data point should have a unique id",
        "enable_for_source_dq_validation": True,
        "enable_for_target_dq_validation": True,
        "is_active": True,
        "enable_error_drop_alert": False,
        "error_drop_threshold": 0,
    }
]

In [78]:
rules_df = spark.createDataFrame(pd.DataFrame(rules_data))

In [79]:
rules_df.write.format("delta").mode("overwrite").saveAsTable(CONFIG['rules_table'])

In [80]:
rules_df.toPandas()

Unnamed: 0,product_id,table_name,rule_type,rule,column_name,expectation,action_if_failed,tag,description,enable_for_source_dq_validation,enable_for_target_dq_validation,is_active,enable_error_drop_alert,error_drop_threshold
0,se_testuser_product,spark_catalog.default.se_testuser_target,agg_dq,data_existing,sales,count(*) > 0,fail,completeness,Data should be present,True,True,True,False,0
1,se_testuser_product,spark_catalog.default.se_testuser_target,agg_dq,min_sales,sales,min(sales)>1000,warn,validity,Minimum sales should be greater than 1000,True,True,True,False,0
2,se_testuser_product,spark_catalog.default.se_testuser_target,agg_dq,no_duplicates,id,count(distinct id) == count(*),fail,uniqueness,Each data point should have a unique id,True,True,True,False,0


### Running spark expectations

In [81]:
from pyspark.sql import DataFrame

from spark_expectations.core import load_configurations
from spark_expectations.config.user_config import Constants as user_config

from spark_expectations.core.expectations import (
    SparkExpectations,
    WrappedDataFrameWriter,
)

In [82]:
writer = WrappedDataFrameWriter().mode("overwrite").format("delta")

In [83]:
load_configurations(spark) 

In [84]:
# Custom config (example enable slack/email notifications)
stats_streaming_config_dict = {user_config.se_enable_streaming: False}
notification_conf = {}

In [85]:
se = SparkExpectations(
    product_id=CONFIG["product_id"],
    rules_df=rules_df,
    stats_table=CONFIG["stats_table"],
    stats_table_writer=writer,
    target_and_error_table_writer=writer,
    stats_streaming_options=stats_streaming_config_dict,
)

In [86]:
#  Initialize input data
data = [
    {"id": 1, "name": "Alice",  "sales": 3500},
    {"id": 2, "name": "Bob",   "sales": 2800},
    {"id": 3, "name": "Charlie", "sales": 4200},
    {"id": 4, "name": "Mike",   "sales": 2300},
    {"id": 5, "name": "Ron", "sales": None},
    {"id": 6, "name": "Zach",   "sales": 3900},
    {"id": 7, "name": "Alex",   "sales": 4100},
    {"id": 8, "name": "Steve",   "sales": 4200},
    {"id": 9, "name": "James",   "sales": 900},
    {"id": 10, "name": "Dan",   "sales": 2500},
    {"id": 4, "name": "Bryan",   "sales": 1600}
]
input_df = spark.createDataFrame(pd.DataFrame(data))
input_df.show(truncate=False)

+---+-------+------+
|id |name   |sales |
+---+-------+------+
|1  |Alice  |3500.0|
|2  |Bob    |2800.0|
|3  |Charlie|4200.0|
|4  |Mike   |2300.0|
|5  |Ron    |NaN   |
|6  |Zach   |3900.0|
|7  |Alex   |4100.0|
|8  |Steve  |4200.0|
|9  |James  |900.0 |
|10 |Dan    |2500.0|
|4  |Bryan  |1600.0|
+---+-------+------+



In [87]:
@se.with_expectations(
    target_table=CONFIG["target_table"],
    write_to_table=True,
    write_to_temp_table=True,
    user_conf=notification_conf,
)
def get_dataset():
    _df_source: DataFrame = input_df
    _df_source.createOrReplaceTempView(CONFIG["in_memory_source"])
    return _df_source

In [88]:
get_dataset()

[2025-09-10 02:06:26,930] [INFO] [spark_expectations] {expectations.py:wrapper:325} - The function dataframe is getting created
[2025-09-10 02:06:27,139] [INFO] [spark_expectations] {expectations.py:wrapper:341} - Validation for rules completed successfully
[2025-09-10 02:06:27,202] [INFO] [spark_expectations] {expectations.py:wrapper:344} - data frame input record count: 11
[2025-09-10 02:06:27,203] [INFO] [spark_expectations] {expectations.py:wrapper:356} - initialize variable with default values before next run
[2025-09-10 02:06:27,203] [INFO] [spark_expectations] {expectations.py:wrapper:390} - Spark Expectations run id for this run: se_testuser_product_bdb44ee8-8dea-11f0-b465-7a2b7591a4dc
[2025-09-10 02:06:27,203] [INFO] [spark_expectations] {expectations.py:wrapper:393} - The function dataframe is created
[2025-09-10 02:06:27,203] [INFO] [spark_expectations] {expectations.py:wrapper:396} - Dropping to temp table started
[2025-09-10 02:06:27,207] [INFO] [spark_expectations] {expec

SparkExpectationsMiscException: error occurred while processing spark expectations error occurred while executing func_process error occurred while taking action on given rules Job failed, as there is a data quality issue at agg_dq expectations and the action_if_failed suggested to fail

In [89]:
query_stats_table = f"SELECT * FROM {CONFIG['stats_table']}"
query_stats_table_df = spark.sql(query_stats_table).toPandas()
query_stats_table_df

25/09/10 02:06:34 ERROR NonFateSharingFuture: Failed to get result from future
scala.runtime.NonLocalReturnControl


Unnamed: 0,product_id,table_name,input_count,error_count,output_count,output_percentage,success_percentage,error_percentage,source_agg_dq_results,final_agg_dq_results,...,final_query_dq_results,row_dq_res_summary,row_dq_error_threshold,dq_status,dq_run_time,dq_rules,meta_dq_run_id,meta_dq_run_date,meta_dq_run_datetime,dq_env
0,se_testuser_product,spark_catalog.default.se_testuser_target,11,0,0,0.0,100.0,0.0,"[{'rule_type': 'agg_dq', 'column_name': 'sales...",,...,,,,"{'final_query_dq': 'Skipped', 'source_agg_dq':...","{'row_dq_run_time': 0.0, 'source_query_dq_run_...",{'query_dq_rules': {'num_final_query_dq_rules'...,se_testuser_product_bdb44ee8-8dea-11f0-b465-7a...,2025-09-10,2025-09-10 02:06:19,


In [90]:
# Failure of rules captured here
query_stats_table_df.loc[0,'source_agg_dq_results']

[{'rule_type': 'agg_dq',
  'column_name': 'sales',
  'description': 'Data should be present',
  'rule': 'data_existing',
  'tag': 'completeness',
  'action_if_failed': 'fail',
  'status': 'pass'},
 {'rule_type': 'agg_dq',
  'column_name': 'sales',
  'description': 'Minimum sales should be greater than 1000',
  'rule': 'min_sales',
  'tag': 'validity',
  'action_if_failed': 'warn',
  'status': 'fail'},
 {'rule_type': 'agg_dq',
  'column_name': 'id',
  'description': 'Each data point should have a unique id',
  'rule': 'no_duplicates',
  'tag': 'uniqueness',
  'action_if_failed': 'fail',
  'status': 'fail'}]

In [91]:
databases_df = spark.sql("SHOW DATABASES")
databases_df.show(truncate=False)

tables_df = spark.sql("SHOW TABLES")
tables_df.show(truncate=False)


+---------+
|namespace|
+---------+
|default  |
+---------+

+---------+-----------------------+-----------+
|namespace|tableName              |isTemporary|
+---------+-----------------------+-----------+
|default  |se_testuser_rules      |false      |
|default  |se_testuser_stats      |false      |
|default  |se_testuser_target     |false      |
|default  |se_testuser_target_temp|false      |
|         |se_testuser_source     |true       |
+---------+-----------------------+-----------+



In [92]:
query_target_table = f"""
SELECT *
FROM {CONFIG['target_table']} 
ORDER BY meta_dq_run_id, id
"""

final_data_set_df = spark.sql(query_target_table)

if final_data_set_df is not None:
    final_data_set_df.show(truncate=False)

+---+-------+------+--------------------------------------------------------+--------------------+
|id |name   |sales |meta_dq_run_id                                          |meta_dq_run_datetime|
+---+-------+------+--------------------------------------------------------+--------------------+
|1  |Alice  |3500.0|se_testuser_product_ab27526c-8de9-11f0-b465-7a2b7591a4dc|2025-09-10 01:58:38 |
|2  |Bob    |2800.0|se_testuser_product_ab27526c-8de9-11f0-b465-7a2b7591a4dc|2025-09-10 01:58:38 |
|3  |Charlie|4200.0|se_testuser_product_ab27526c-8de9-11f0-b465-7a2b7591a4dc|2025-09-10 01:58:38 |
|4  |Bryan  |1600.0|se_testuser_product_ab27526c-8de9-11f0-b465-7a2b7591a4dc|2025-09-10 01:58:38 |
|4  |Mike   |2300.0|se_testuser_product_ab27526c-8de9-11f0-b465-7a2b7591a4dc|2025-09-10 01:58:38 |
|5  |Ron    |NaN   |se_testuser_product_ab27526c-8de9-11f0-b465-7a2b7591a4dc|2025-09-10 01:58:38 |
|6  |Zach   |3900.0|se_testuser_product_ab27526c-8de9-11f0-b465-7a2b7591a4dc|2025-09-10 01:58:38 |
|7  |Alex 

In [93]:
input_count = spark.sql(f"SELECT COUNT(*) AS count FROM {CONFIG['in_memory_source']}").collect()[0]['count']
output_count = spark.sql(f"SELECT COUNT(*) AS count FROM {CONFIG['target_table']}").collect()[0]['count']

# Find missing rows in target_table that are present in in_memory_source
removed_rows_df = spark.sql(f"""
SELECT s.*
FROM {CONFIG['in_memory_source']} s
LEFT ANTI JOIN {CONFIG['target_table']} t
ON s.id = t.id
""")

removed_rows_count = removed_rows_df.count()

comparison_df = spark.createDataFrame(
    [
        ("input", input_count),
        ("output", output_count),
        ("removed_records", removed_rows_count)
    ],
    ["table", "record_count"]
)

comparison_df.show()

if removed_rows_count > 0:
    removed_rows_df.show(truncate=False)

+---------------+------------+
|          table|record_count|
+---------------+------------+
|          input|          11|
|         output|          11|
|removed_records|           0|
+---------------+------------+

