In [1]:
# GENERATE INPUT WIDGETS FOR CONFIGURATION

import ipywidgets as widgets
from IPython.display import display
import re
import pandas as pd

def _labelled_text(label, initial, hint='Type something', label_width='100px'):
    """Create an enabled single-line text input with a fixed-width description label."""
    return widgets.Text(
        value=initial,
        placeholder=hint,
        description=label,
        disabled=False,
        style={'description_width': label_width},
    )


# Free-text inputs (created in the same order as before).
widget_user = _labelled_text('user: ', 'testuser')
widget_git_org = _labelled_text('git_org ', 'Nike-Inc')
widget_catalog = _labelled_text('catalog:', 'spark_catalog')
widget_schema = _labelled_text('schema:', 'default')

# Library source is limited to the listed options (ensure_option=True).
widget_library_source = widgets.Combobox(
    value='git',
    options=['pypi', 'git'],
    ensure_option=True,
    placeholder='Choose source',
    description='library_source:',
    disabled=False,
    style={'description_width': '100px'},
)

# The branch/commit input uses a wider label and its own placeholder text.
widget_git_branch_or_commit = _labelled_text(
    'git_branch_or_commit:',
    'main',
    hint='Type branch name or commit hash',
    label_width='150px',
)

widget_override_version = widgets.Checkbox(
    value=False,
    description='Override SE version',
    disabled=False,
    style={'description_width': '30px'},
)

# Single row holding every configuration control, in display order.
_config_controls = [
    widget_user,
    widget_catalog,
    widget_schema,
    widget_override_version,
    widget_library_source,
    widget_git_org,
    widget_git_branch_or_commit,
]
hbox = widgets.HBox(_config_controls)

In [None]:
# Display widgets
# Renders the `hbox` row of configuration controls; the values entered here
# are read from the individual `widget_*` objects by the cells that follow,
# so edit them before running the extraction cell.
display(hbox)

In [None]:
# Extract configuration values from widgets

# The user name is embedded in table names and in the cleanup LIKE pattern
# built further down, so it is reduced to lowercase ASCII letters only.
user = re.sub(r'[^a-zA-Z]', '', widget_user.value).lower()
if not user:
    # An input with no letters (e.g. "123" or blank) sanitizes to ''. That would
    # yield names such as "se__rules" that are not scoped to any one user, and
    # the prefix-based cleanup would no longer be limited to this user's objects.
    raise ValueError(
        f"Invalid user {widget_user.value!r}: it must contain at least one ASCII letter."
    )

# Catalog and schema are interpolated into SQL identifiers, so stray
# leading/trailing whitespace from the text box is removed.
catalog = widget_catalog.value.strip()
schema = widget_schema.value.strip()
override_se_version = widget_override_version.value
library = widget_library_source.value
org = widget_git_org.value
branch_or_commit = widget_git_branch_or_commit.value

# Echo the effective values so the run is self-documenting.
print(f"User: {user}")
print(f"Catalog: {catalog}")
print(f"Schema: {schema}")
print(f"Override SE Version: {override_se_version}")
print(f"Library Source: {library}")
print(f"Git Organization: {org}")
print(f"Branch/Commit: {branch_or_commit}")

In [None]:
# Build configuration dictionary
def _user_table(suffix):
    """Return the fully qualified `<catalog>.<schema>.se_<user>_<suffix>` table name."""
    return f"{catalog}.{schema}.se_{user}_{suffix}"


# Key order matters: it is the row order of the summary frame shown below.
CONFIG = dict(
    owner=user,
    catalog=catalog,
    schema=schema,
    user=user,
    product_id=f"se_{user}_product",
    rules_table=_user_table("rules"),
    stats_table=_user_table("stats"),
    customers_table=_user_table("customers"),
    orders_table=_user_table("orders"),
    products_table=_user_table("products"),
    override_se_version=override_se_version,
    library=library,
    org=org,
    branch_or_commit=branch_or_commit,
)

# Two-column (Key, Value) view of the configuration for rich display.
_config_rows = list(CONFIG.items())
config_df = pd.DataFrame(_config_rows, columns=['Key', 'Value'])
display(config_df)

In [None]:
# Display current Spark Expectations version
# Looks up the installed distribution metadata for "spark-expectations";
# importlib raises PackageNotFoundError if the package is not installed.
from importlib.metadata import version

print("---- Current SparkExpectation Version: {}".format(version('spark-expectations')))

### Setting up Spark Session

In [None]:
# CREATE SPARK SESSION
from pyspark.sql import SparkSession

# Delta Lake settings applied to the builder, in this order:
# SQL extension, catalog implementation, and the Delta package coordinates.
_delta_session_conf = (
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0"),
)

_session_builder = SparkSession.builder.appName("Spark Expectations - DQ Pro Rules Test")
for _conf_key, _conf_value in _delta_session_conf:
    _session_builder = _session_builder.config(_conf_key, _conf_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = _session_builder.getOrCreate()

print("‚úÖ Spark session created successfully")

In [None]:
# Show existing databases and tables
# Both listings are kept as DataFrames and printed without column truncation,
# databases first and tables second.
databases_df = spark.sql("SHOW DATABASES")
tables_df = spark.sql("SHOW TABLES")

for _listing_df in (databases_df, tables_df):
    _listing_df.show(truncate=False)

### Cleanup Existing Tables and Views

In [None]:
# Clean up existing tables and views from previous runs
#
# Defines `db_name` and `pattern`, which the view-cleanup cell reuses.
db_name = f"{CONFIG['catalog']}.{CONFIG['schema']}"

# Every object named in CONFIG is "se_<user>_<suffix>", so the pattern keeps the
# underscore after the user name. A bare "se_<user>*" prefix would also match
# other users whose name merely starts with this one (user "test" would match
# "se_testuser_rules"). In SHOW ... LIKE patterns only "*" and "|" are special.
pattern = f"se_{CONFIG['user']}_*"

# Set the current catalog
spark.sql(f"USE {CONFIG['catalog']}")

# Drop tables matching pattern; temporary entries are skipped here.
tables_df = spark.sql(f"SHOW TABLES IN {db_name} LIKE '{pattern}'")
tables_to_drop = [row for row in tables_df.collect() if not row["isTemporary"]]

if tables_to_drop:
    print(f"üßπ Found {len(tables_to_drop)} tables to drop.")
    for row in tables_to_drop:
        table_name = row["tableName"]
        # Always drop by fully qualified name so the current schema is irrelevant.
        spark.sql(f"DROP TABLE IF EXISTS {db_name}.{table_name}")
        print(f"   ‚úì Dropped table: {db_name}.{table_name}")
else:
    print("‚úÖ No tables to drop")

In [None]:
# Drop views matching pattern
#
# Relies on `db_name` and `pattern` from the table-cleanup cell.
views_df = spark.sql(f"SHOW VIEWS in {db_name} LIKE '{pattern}'")
views_to_drop = views_df.collect()

if views_to_drop:
    print(f"üßπ Found {len(views_to_drop)} views to drop.")
    for row in views_to_drop:
        view_name = row["viewName"]
        # Temporary views have no namespace and must be dropped by bare name.
        # Persistent views are qualified with db_name: only the catalog was
        # selected with USE, so an unqualified name would resolve against the
        # session's current schema and DROP ... IF EXISTS would silently do
        # nothing when that is not CONFIG['schema'].
        target_view = view_name if row["isTemporary"] else f"{db_name}.{view_name}"
        spark.sql(f"DROP VIEW IF EXISTS {target_view}")
        print(f"   ‚úì Dropped view: {target_view}")
else:
    print("‚úÖ No views to drop")

### Load Rules from YAML File
Now let's load the comprehensive DQ rules from `resources/dqpro_rules.yaml`.

In [None]:
# Load rules from YAML file
import yaml
import os

# Path to the rules file
rules_file_path = "/app/notebooks/resources/dqpro_rules.yaml"
resources_dir = os.path.dirname(rules_file_path)

# Check if file exists
if not os.path.exists(rules_file_path):
    print(f"‚ùå Rules file not found at: {rules_file_path}")
    # List the directory as a troubleshooting aid, but only if it exists:
    # os.listdir on a missing directory would raise and mask the real problem.
    if os.path.isdir(resources_dir):
        print("Available files in resources:")
        for resource_name in os.listdir(resources_dir):
            print(f"  - {resource_name}")
    # Stop here: continuing would leave `rules_yaml` undefined (or stale from an
    # earlier run) and the conversion cell would fail or use outdated rules.
    raise FileNotFoundError(f"Rules file not found at: {rules_file_path}")

print(f"‚úÖ Loading rules from: {rules_file_path}")

# Explicit encoding so the result does not depend on the platform default.
with open(rules_file_path, 'r', encoding='utf-8') as rules_file:
    rules_yaml = yaml.safe_load(rules_file)

# The conversion step iterates `rules_yaml.items()`, so the document must be a
# mapping; safe_load returns None for an empty file.
if not isinstance(rules_yaml, dict):
    raise ValueError(
        f"Expected a mapping of rules in {rules_file_path}, got {type(rules_yaml).__name__}"
    )

print(f"üìã Loaded {len(rules_yaml)} rules from YAML file")

In [None]:
# Convert YAML rules to DataFrame format

# Rule fields copied from each YAML entry, with the fallback used when the
# entry omits the key. Order here is the column order of the resulting rows.
_RULE_FIELD_DEFAULTS = (
    ("table_name", ""),
    ("rule_type", ""),
    ("rule", ""),
    ("column_name", ""),
    ("expectation", ""),
    ("action_if_failed", ""),
    ("tag", ""),
    ("description", ""),
    ("enable_for_source_dq_validation", True),
    ("enable_for_target_dq_validation", True),
    ("is_active", True),
    ("enable_error_drop_alert", False),
    ("error_drop_threshold", 0),
    ("enable_querydq_custom_output", False),
    ("query_dq_delimiter", None),
    ("priority", "medium"),
)

# Keyword found in the YAML table name -> CONFIG key holding the real table.
# Checked in this order; the first keyword that matches wins.
_TABLE_KEYWORD_TO_CONFIG_KEY = (
    ("customers", "customers_table"),
    ("orders", "orders_table"),
    ("products", "products_table"),
)

rules_data = []

for rule_key, rule_value in rules_yaml.items():
    # product_id always comes from CONFIG, never from the YAML entry.
    rule_dict = {"product_id": CONFIG["product_id"]}
    for field_name, fallback in _RULE_FIELD_DEFAULTS:
        rule_dict[field_name] = rule_value.get(field_name, fallback)

    # Update table names to use our config
    for keyword, config_key in _TABLE_KEYWORD_TO_CONFIG_KEY:
        if keyword in rule_dict["table_name"]:
            rule_dict["table_name"] = CONFIG[config_key]
            break

    rules_data.append(rule_dict)

print(f"‚úÖ Converted {len(rules_data)} rules to DataFrame format")

In [None]:
# Create rules DataFrame and display summary
_rules_pdf = pd.DataFrame(rules_data)
rules_df = spark.createDataFrame(_rules_pdf)

# One count-per-value table for each grouping column, printed under its heading.
_summary_sections = (
    ("üìä Rules Summary by Type:", "rule_type"),
    ("\nüìä Rules Summary by Action:", "action_if_failed"),
    ("\nüìä Rules Summary by Tag:", "tag"),
)

for _heading, _group_column in _summary_sections:
    print(_heading)
    rules_df.groupBy(_group_column).count().show()

In [None]:
# Display all DQ PRO rules
# Prints the rows of `rules_df` with truncate=False so long values such as
# rule expressions and descriptions are shown in full rather than cut off.
print("üìã ALL DQ Rules loaded from yaml file:")
rules_df.show(truncate=False)

In [None]:
# Save rules to Delta table
# "overwrite" mode replaces any previous contents of the rules table.
_rules_table = CONFIG['rules_table']
_rules_writer = rules_df.write.format("delta").mode("overwrite")
_rules_writer.saveAsTable(_rules_table)
print(f"‚úÖ Rules saved to table: {CONFIG['rules_table']}")

# Verify the table was created
_rule_count_df = spark.sql(f"SELECT COUNT(*) as rule_count FROM {CONFIG['rules_table']}")
_rule_count_df.show()