### Spark - Expectations - User - Guide - Documentation
* Please read through the [Spark Expectation Documentation](https://engineering.nike.com/spark-expectations) before proceeding with this demo

#### widgets 
* `catalog`, `schema` - define where output tables are going to be created 
  * Tables are prefixed with `se_<user>_`, where `<user>` is derived from the logged-in Databricks (DBX) user name
* `library_source` combo box selects where the library is installed from (PyPI or a git branch)
  * `pypi` (installs the latest published version available on PyPI)
  * `git` (installs the library from the specified git org and branch/commit)
    * Set the `git_org` and `git_branch_or_commit` input fields to the GitHub org/user and the branch or commit to install (example `main`)  

### Initialize notebook config
This step reads the widget values that determine where the Spark-Expectations library is installed from, and defines the catalog, schema and table names used in the rest of the notebook


In [0]:
%python

import re
import pandas as pd

# Resolve the logged-in Databricks user and keep only the part before the "@".
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
logged_in_user = notebook_context.userName().get().split('@')[0]

# Keep letters only (lower-cased) so the value can be embedded in object names.
user = re.sub(r'[^a-zA-Z]', '', logged_in_user).lower()

# Read the notebook widgets, in the same order as the variables they populate.
catalog, schema, library, org, branch_or_commit = (
    dbutils.widgets.get(widget_name)
    for widget_name in (
        "catalog",
        "schema",
        "library_source",
        "git_org",
        "git_branch_or_commit",
    )
)

# Every output table shares the `<catalog>.<schema>.se_<user>` prefix.
table_prefix = f"{catalog}.{schema}.se_{user}"

# Single source of truth for names and install options used by the later cells.
CONFIG = dict(
    owner=user,
    catalog=catalog,
    schema=schema,
    user=user,
    product_id=f"se_{user}_product",
    in_memory_source=f"se_{user}_source",
    rules_table=f"{table_prefix}_rules",
    stats_table=f"{table_prefix}_stats",
    target_table=f"{table_prefix}_target",
    library=library,
    org=org,
    branch_or_commit=branch_or_commit,
)

# Show the resolved configuration as a two-column Key/Value table.
config_df = pd.DataFrame(
    {"Key": list(CONFIG.keys()), "Value": list(CONFIG.values())}
)
display(config_df)

### Install - Required - Libraries
* Spark Expectations
* Jinja2 (required for using custom templates) 


In [0]:

if CONFIG["library"] == "pypi":
  print("-----INSTALLING SPARK-EXPECTATIONS from PyPi")
  %pip install spark-expectations
elif CONFIG["library"] == "git":
  print(f"-----INSTALLING SPARK-EXPECTATIONS from Git Org/User {CONFIG['org']}, Branch/Commit {CONFIG['branch_or_commit']}")
  giturl = f"git+https://github.com/{CONFIG['org']}/spark-expectations.git@{CONFIG['branch_or_commit']}"
  %pip install --force-reinstall {giturl}

print("-----INSTALLING Jinja2 template library")
%pip install jinja2 

%restart_python

# Cleanup 
- Removing previously created user-prefixed tables and views

In [0]:
# Remove tables and views left behind by a previous run of this demo.
# Only objects whose name matches `se_<user>*` are targeted.
db_name = f"{CONFIG['catalog']}.{CONFIG['schema']}"
pattern = f"se_{CONFIG['user']}*"

# Set the current catalog
spark.sql(f"USE CATALOG {CONFIG['catalog']}")

# Drop tables matching pattern (rows flagged isTemporary are skipped here)
tables_df = spark.sql(f"SHOW TABLES IN {db_name} LIKE '{pattern}'")
tables_to_drop = [row for row in tables_df.collect() if not row["isTemporary"]]

if tables_to_drop:
    print(f"Found {len(tables_to_drop)} tables to drop.")
    for row in tables_to_drop:
        table_name = row["tableName"]
        spark.sql(f"DROP TABLE IF EXISTS {db_name}.{table_name}")
        print(f"Dropped table: {db_name}.{table_name}")
else:
    print("----- No tables to drop")

# Drop views matching pattern (views in the schema plus temp views that are listed)
views_df = spark.sql(f"SHOW VIEWS in {db_name} LIKE '{pattern}'")
views_to_drop = views_df.collect()

if views_to_drop:
    print(f"Found {len(views_to_drop)} views to drop.")
    for row in views_to_drop:
        view_fields = row.asDict()
        view_name = view_fields["viewName"]
        if view_fields.get("isTemporary"):
            # Temp views are not part of the schema: use their own namespace
            # when one is reported, otherwise the bare name.
            namespace = view_fields.get("namespace")
            qualified_view = f"{namespace}.{view_name}" if namespace else view_name
        else:
            # Only the catalog was selected above, so an unqualified name would
            # not resolve to the configured schema; qualify it explicitly.
            qualified_view = f"{db_name}.{view_name}"
        spark.sql(f"DROP VIEW IF EXISTS {qualified_view}")
        print(f"Dropped view: {qualified_view}")
else:
    print("----- No views to drop")

### Run Spark Expectation

Steps:
1. Create rules dataframe

2. Provide custom SparkExpectation config that overwrites default config
- stats_streaming_options, 
- user_conf 

3. Initialize sparkExpectation
- loading input dataset
- running sparkExpectations

In [0]:
# Getting Started with Spark Expectations: Simple Example

## 1. Sample Source Dataset
# initialize simple Pandas DataFrame and convert it to a Spark DataFrame

import pandas as pd
from pyspark.sql import SparkSession

# Initialize Default Configuration  (not needed when running in Databricks notebook)
spark = SparkSession.builder.getOrCreate()


## 2. Define Simple `row_dq` Rules
# Create a rules DataFrame with a few simple data quality rules

def _row_dq_rule(rule, column_name, expectation, action_if_failed, tag, description):
    """Build one `row_dq` rule record for the demo target table.

    Only the fields that differ between the demo rules are parameters; the
    remaining fields are the same for every rule. Key order is significant
    because it defines the column order of the rules DataFrame.
    """
    return {
        "product_id": CONFIG["product_id"],
        "table_name": CONFIG["target_table"],
        "rule_type": "row_dq",
        "rule": rule,
        "column_name": column_name,
        "expectation": expectation,
        "action_if_failed": action_if_failed,
        "tag": tag,
        "description": description,
        "enable_for_source_dq_validation": True,
        "enable_for_target_dq_validation": True,
        "is_active": True,
        "enable_error_drop_alert": False,
        "error_drop_threshold": 0,
    }


rules_data = [
    _row_dq_rule(
        rule="age_not_null",
        column_name="age",
        expectation="age IS NOT NULL",
        action_if_failed="drop",
        tag="completeness",
        description="Age must not be null",
    ),
    _row_dq_rule(
        rule="age_adult",
        column_name="age",
        expectation="age < 20",
        action_if_failed="ignore",
        tag="validity",
        description="Age must be at less than 20",
    ),
    _row_dq_rule(
        rule="email_not_null",
        column_name="email",
        expectation="email IS NOT NULL",
        action_if_failed="warn",
        tag="completeness",
        description="Email must not be null",
    ),
]

# Convert to a Spark DataFrame, print it, and persist it as the rules table.
rules_df = spark.createDataFrame(pd.DataFrame(rules_data))
rules_df.show(truncate=True)
rules_df.write.mode("overwrite").saveAsTable(CONFIG['rules_table'])

display(rules_df)

In [0]:
## 3. Run Spark Expectations

from pyspark.sql import DataFrame

from spark_expectations.core import load_configurations
from spark_expectations.config.user_config import Constants as user_config
from spark_expectations.core.expectations import SparkExpectations, WrappedDataFrameWriter

# One writer (append mode, delta format) shared by the stats and target/error tables.
writer = WrappedDataFrameWriter().mode("append").format("delta")

# Initialize Default Config
load_configurations(spark)

# Custom config (example enable slack/email notifications)
stats_streaming_config_dict = {user_config.se_enable_streaming: False}
notification_conf = {}

# SparkExpectations implements/supports running the data quality rules on a
# dataframe returned by a function.
#
# Args:
#     product_id: Name of the product
#     rules_df: DataFrame which contains the rules. User is responsible for reading
#         the rules_table in which ever system it is
#     stats_table: Name of the table where the stats/audit-info need to be written
#     debugger: Mark it as "True" if the debugger mode need to be enabled, by default is False
#     stats_streaming_options: Provide options to override the defaults, while writing
#         into the stats streaming table
se = SparkExpectations(
    product_id=CONFIG["product_id"],
    rules_df=rules_df,
    stats_table=CONFIG["stats_table"],
    stats_table_writer=writer,
    target_and_error_table_writer=writer,
    stats_streaming_options=stats_streaming_config_dict,
)

#  Initialize input data
columns = ("id", "age", "email")
records = [
    (1, 19, "alice@example.com"),
    (2, 17, "bob@example.com"),
    (3, None, "charlie@example.com"),
    (4, 40, "mike@example.com"),
    (5, None, "ron@example.com"),
    (6, 35, None),
]
data = [dict(zip(columns, record)) for record in records]
input_df = spark.createDataFrame(pd.DataFrame(data))
input_df.show()

# The `with_expectations` decorator wraps a function which returns a dataframe and
# applies the dataframe rules on it.
#
# Args:
#     target_table: Name of the table where the final dataframe need to be written
#     write_to_table: Mark it as "True" if the dataframe need to be written as table
#     write_to_temp_table: Mark it as "True" if the input dataframe need to be written
#         to the temp table to break the spark plan
#     user_conf: Provide options to override the defaults, while writing into the
#         stats streaming table
#     target_table_view: This view is created after the _row_dq process to run the
#         target agg_dq and query_dq. If value is not provided, defaulted to
#         {target_table}_view
#     target_and_error_table_writer: Provide the writer to write the target and error
#         table, this will take precedence over the class level writer
#
# Returns:
#     Any: Returns a function which applied the expectations on dataset


@se.with_expectations(
    target_table=CONFIG["target_table"],
    write_to_table=True,
    write_to_temp_table=True,
    user_conf=notification_conf,
)
def get_dataset():
    """Register the input DataFrame as a temp view and return it for DQ checks."""
    source_df: DataFrame = input_df
    # The temp view lets later cells query the unfiltered input by name.
    source_df.createOrReplaceTempView(CONFIG["in_memory_source"])
    return source_df


# This will run the DQ checks and raise if any "fail" rules are violated
get_dataset()

# Results

Check out the `Stats` and `Error` tables to see the spark-expectations execution results

In [0]:
# Build one SELECT per result table: the stats table and the `<target>_error` table.
query_stats_table = "SELECT * FROM {}".format(CONFIG['stats_table'])
query_error_table = "SELECT * FROM {}_error".format(CONFIG['target_table'])

# Render the stats table first, then the error table.
for results_query in (query_stats_table, query_error_table):
    display(spark.sql(results_query))


# Display Target Table
Based on the provided rules, rows that fail a check whose `action_if_failed` is set to `drop` are removed from the target table

In [0]:
# Read the target table back, ordered by run id and record id, and display it.
query_target_table = f"""
SELECT *
FROM {CONFIG['target_table']} 
ORDER BY meta_dq_run_id, id
"""

# spark.sql returns a DataFrame or raises, so no None check is needed before display.
final_data_set_df = spark.sql(query_target_table)
display(final_data_set_df)

## Display Removed rows

In [0]:
def _count_rows(relation):
    """Return the row count of `relation` (a table or view name) via COUNT(*)."""
    return spark.sql(f"SELECT COUNT(*) AS count FROM {relation}").collect()[0]['count']


# Row counts before (in-memory source view) and after (target table) the DQ run.
input_count = _count_rows(CONFIG['in_memory_source'])
output_count = _count_rows(CONFIG['target_table'])

# Source rows whose id has no match in the target table.
removed_rows_df = spark.sql(f"""
SELECT s.*
FROM {CONFIG['in_memory_source']} s
LEFT ANTI JOIN {CONFIG['target_table']} t
ON s.id = t.id
""")

removed_rows_count = removed_rows_df.count()

# Summarise the three counts in a small two-column DataFrame.
record_counts = [
    ("input", input_count),
    ("output", output_count),
    ("removed_records", removed_rows_count),
]
comparison_df = spark.createDataFrame(record_counts, ["table", "record_count"])

display(comparison_df)

# Only show the removed rows when there are any.
if removed_rows_count:
    display(removed_rows_df)

# Cleanup 
- Post-execution removal of tables and views

In [0]:
# Optional post-run cleanup, currently disabled: every line below is commented out,
# so running this cell does nothing. Uncomment it to drop the `se_<user>*` tables
# and views in the configured catalog/schema after the demo.
# NOTE(review): presumably left disabled so the result tables stay available for
# inspection -- confirm.

# db_name = f"{CONFIG['catalog']}.{CONFIG['schema']}"
# pattern = f"se_{CONFIG['user']}*"

# # Set the current catalog
# spark.sql(f"USE CATALOG {CONFIG['catalog']}")

# # Drop tables matching pattern
# tables_df = spark.sql(f"SHOW TABLES IN {db_name} LIKE '{pattern}'")
# tables_to_drop = [row for row in tables_df.collect() if not row["isTemporary"] ]

# if tables_to_drop:
#     print(f"Found {len(tables_to_drop)} tables to drop.")
#     for row in tables_to_drop:
#         table_name = row["tableName"]
#         spark.sql(f"DROP TABLE IF EXISTS {db_name}.{table_name}")
#         print(f"Dropped table: {db_name}.{table_name}")
# else:
#     print("----- No tables to drop")

# # Drop global and local temp views matching pattern

# views_df = spark.sql(f"SHOW VIEWS in {db_name} LIKE '{pattern}'")
# views_to_drop = views_df.collect()

# if views_to_drop:
#     print(f"Found {len(views_to_drop)} views to drop.")
#     for row in views_to_drop:
#         view_name = row["viewName"]
#         spark.sql(f"DROP VIEW IF EXISTS {view_name}")
#         print(f"Dropped view: {view_name}")
# else:
#     print("----- No views to drop")