In [0]:
# Remove every widget currently defined on this notebook before the widgets are (re)created below.
dbutils.widgets.removeAll()

In [0]:
%restart_python

In [0]:
%python
# create secret scope for Databricks and store it (check afterwards if secret can be pulled)
from databricks.sdk import WorkspaceClient
from pyspark.errors import PySparkException
w = WorkspaceClient()
w.secrets.list_scopes()

scope="pagerduty_test"
# enter in your pagerduty integration key to use in notebook (or retrieve from workspace)
integration_key="your_integration_key"

try:
    w.secrets.create_scope(scope=scope)
except PySparkException as pyex:
    print(f"PysparkException: {pyex}")
except Exception as e:
    print(f"Exception: {e}")

w.secrets.put_secret(scope=scope, key=integration_key, string_value="pagerduty_secret_demo")

In [0]:
%python

# below code to create widgets of database, stats_table_name and table_name
dbutils.widgets.text("user", "user")
dbutils.widgets.text("git_org", "Nike-Inc") #name of widget is 'database', default value is 'demo_db'
dbutils.widgets.text("catalog", "development")
dbutils.widgets.text("schema", "default")
dbutils.widgets.text("library_source", "git")
dbutils.widgets.text("git_branch_or_commit", "main")
dbutils.widgets.text("override_version", "False")

# dbutils.widgets.dropdown("database", "default", [database[0] for database in spark.catalog.listDatabases()]) -- limit 1024
dbutils.widgets.text("in_memory_source", "se_rg_source")


In [0]:
import re
import pandas as pd

# Read the notebook widgets. The user name is reduced to lowercase ASCII letters so it
# can be embedded in table and view names.
user = re.sub(r'[^a-zA-Z]', '', dbutils.widgets.get("user")).lower()
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")
library = dbutils.widgets.get("library_source")
# Widget values are strings: this holds text such as "False", not a bool, so it must be
# parsed before being used as a condition.
override_se_version = dbutils.widgets.get("override_version")
org = dbutils.widgets.get("git_org")
branch_or_commit = dbutils.widgets.get("git_branch_or_commit")
# NOTE(review): this widget value is read but CONFIG["in_memory_source"] below is derived
# from the user name instead — confirm which of the two is intended.
in_memory_source = dbutils.widgets.get("in_memory_source")

# Fully qualified prefix shared by every table this notebook creates. It honours the
# "catalog" widget (default "development") instead of a hard-coded catalog name.
_table_prefix = f"{catalog}.{schema}.se_{user}"

CONFIG = {
    "owner": user,
    "catalog": catalog,
    "schema": schema,
    "user": user,
    "product_id": f"se_{user}_product",
    "in_memory_source": f"se_{user}_source",
    "rules_table": f"{_table_prefix}_rules",
    "stats_table": f"{_table_prefix}_stats",
    "target_table": f"{_table_prefix}_target",
    "override_se_version" : override_se_version,
    "library": library,
    "org": org,
    "branch_or_commit": branch_or_commit
}

# Two-column (Key, Value) view of the configuration for display.
config_df = pd.DataFrame(list(CONFIG.items()), columns=['Key', 'Value'])

In [0]:
# Render the Key/Value configuration table so the resolved settings can be checked before running.
display(config_df)

In [0]:
# Override Spark Expectations based on user input
if override_se_version:
    print("-----OVERRIDING SPARK-EXPECTATIONS VERSION")
    if CONFIG["library"] == "pypi":
      print("-----INSTALLING SPARK-EXPECTATIONS from PyPi")
      %pip install spark-expectations
    elif CONFIG["library"] == "git":
      print(f"-----INSTALLING SPARK-EXPECTATIONS from Git Org/User {CONFIG['org']}, Branch/Commit {CONFIG['branch_or_commit']}")
      giturl = f"git+https://github.com/{CONFIG['org']}/spark-expectations.git@{CONFIG['branch_or_commit']}"
      %pip install --force-reinstall {giturl}    
else:
    print(f"---- Using SparkExpectation from local codebase")

In [0]:
# CREATE SPARK SESSION AND DATABASE
from pyspark.sql import SparkSession

# Obtain the Spark session (created if none exists), requesting the Delta Lake
# SQL extension, the Delta catalog and the delta-spark package.
spark = (
    SparkSession.builder
    .appName("Spark PagerDuty Integration")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0")
    .getOrCreate()
)

In [0]:
# Sanity check of the session: list the visible databases, then the tables of the
# current database, printing both without truncating column values.
databases_df = spark.sql("SHOW DATABASES")
tables_df = spark.sql("SHOW TABLES")

for _listing_df in (databases_df, tables_df):
    _listing_df.show(truncate=False)

In [0]:
import pandas as pd
from pyspark.sql import SparkSession


In [0]:
def _row_dq_rule(rule, column_name, expectation, action_if_failed, tag, description, priority):
    """Build one row-level data-quality rule record for this product's target table.

    Only the fields that differ between the rules are parameters; every rule is
    active, enabled for both source and target validation, and has error-drop
    alerting switched off with a threshold of 0.
    """
    return {
        "product_id": CONFIG["product_id"],
        "table_name": CONFIG["target_table"],
        "rule_type": "row_dq",
        "rule": rule,
        "column_name": column_name,
        "expectation": expectation,
        "action_if_failed": action_if_failed,
        "tag": tag,
        "description": description,
        "enable_for_source_dq_validation": True,
        "enable_for_target_dq_validation": True,
        "is_active": True,
        "enable_error_drop_alert": False,
        "error_drop_threshold": 0,
        "priority": priority,
    }


# Three demo rules, one per failure action (warn / ignore / drop) and priority level.
rules_data = [
    _row_dq_rule(
        rule="age_not_null",
        column_name="age",
        expectation="age IS NOT NULL",
        action_if_failed="warn",
        tag="completeness",
        description="Age must not be null",
        priority="low",
    ),
    _row_dq_rule(
        rule="age_adult",
        column_name="age",
        expectation="age < 20",
        action_if_failed="ignore",
        tag="validity",
        description="Age must be less than 20",
        priority="medium",
    ),
    _row_dq_rule(
        rule="email_not_null",
        column_name="email",
        expectation="email IS NOT NULL",
        action_if_failed="drop",
        tag="completeness",
        description="Email must not be null",
        priority="high",
    ),
]

# Materialise the rules as a Spark DataFrame and persist them, replacing any previous rules table.
rules_df = spark.createDataFrame(rules_data)
(
    rules_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(CONFIG['rules_table'])
)

In [0]:
# Preview the rules as a pandas DataFrame (the cell's last expression is rendered as its output).
rules_df.toPandas()

In [0]:
# Configure streaming and notification configuration
from spark_expectations.config.user_config import Constants as user_config
from dbruntime.databricks_repl_context import get_context

# Options passed to SparkExpectations as stats_streaming_options; streaming is switched off here.
stats_streaming_config_dict = {
    user_config.se_enable_streaming: False,
}


# Notification settings passed to the with_expectations decorator as user_conf.
# The SMTP host, e-mail addresses and Slack webhook below are placeholders that must be
# replaced with real values for those channels to work.
user_conf_dict = {
    # pagerduty notifications flags
    user_config.se_notifications_enable_pagerduty: True,
    user_config.se_notifications_pagerduty_integration_key: integration_key,
    user_config.se_notifications_pagerduty_webhook_url: "https://events.pagerduty.com/v2/enqueue",

    # email flags
    user_config.se_notifications_enable_email: True,
    user_config.se_notifications_email_smtp_host: "smtp_host",
    user_config.se_notifications_email_smtp_port: 25,
    user_config.se_notifications_email_from: "from.email@nike.com",
    user_config.se_notifications_email_to_other_mail_id: "to.email@nike.com",
    user_config.se_notifications_email_subject: "SE notification integration testing",

    # slack flags
    user_config.se_notifications_enable_slack: True,
    user_config.se_notifications_slack_webhook_url: "slack_webhook",
    user_config.se_notifications_min_priority_slack: "medium",

    # Notification triggers
    user_config.se_notifications_on_start: True,
    user_config.se_notifications_on_completion: True,
    user_config.se_notifications_on_fail: True,
    user_config.se_notifications_on_error_drop_exceeds_threshold_breach: True,
    user_config.se_notifications_on_rules_action_if_failed_set_ignore: True,
    user_config.se_notifications_on_error_drop_threshold: 1,
}

# Show the configuration with credentials masked, so the integration key and the
# webhook URL do not end up in the saved notebook output. user_conf_dict itself is unchanged.
_SENSITIVE_CONF_KEYS = {
    user_config.se_notifications_pagerduty_integration_key,
    user_config.se_notifications_slack_webhook_url,
}

display({
    conf_key: ("***REDACTED***" if conf_key in _SENSITIVE_CONF_KEYS else conf_value)
    for conf_key, conf_value in user_conf_dict.items()
})

In [0]:
%sh

# Install Jinja2 with the pip available in the shell.
# NOTE(review): presumably needed for rendering notification templates — confirm.
# NOTE(review): a shell-level pip may not target the notebook's Python environment;
# verify the package is importable from the Python cells.
pip install jinja2

In [0]:
from pyspark.sql import DataFrame

from spark_expectations.core import load_configurations

from spark_expectations.core.expectations import (
    SparkExpectations,
    WrappedDataFrameWriter,
)


In [0]:
# Writer settings (overwrite mode, Delta format) handed to SparkExpectations for its table writes.
writer = WrappedDataFrameWriter().mode("overwrite").format("delta")

In [0]:
# Load the Spark Expectations configuration for this Spark session.
# NOTE(review): the exact settings applied are defined by spark_expectations.core.load_configurations — confirm.
load_configurations(spark) 

In [0]:
# Spark Expectations instance for this product: rules are taken from the in-memory
# rules_df, statistics go to the configured stats table, and the same writer
# settings are used for the stats table and the target/error tables.
se = SparkExpectations(
    rules_df=rules_df,
    product_id=CONFIG["product_id"],
    stats_table=CONFIG["stats_table"],
    target_and_error_table_writer=writer,
    stats_table_writer=writer,
    stats_streaming_options=stats_streaming_config_dict,
)

In [0]:
#  Initialize input data
data = [
    {"id": 1, "age": 19,   "email": "alice@example.com"},
    {"id": 2, "age": 17,   "email": "bob@example.com"},
    {"id": 3, "age": None, "email": "charlie@example.com"},
    {"id": 4, "age": 40,   "email": "mike@example.com"},
    {"id": 5, "age": None, "email": "ron@example.com"},
    {"id": 6, "age": 35,   "email": None},
]
input_df = spark.createDataFrame(data)
input_df.show(truncate=False)

In [0]:
@se.with_expectations(
    target_table=CONFIG["target_table"],
    write_to_table=True,
    write_to_temp_table=True,
    user_conf=user_conf_dict,
)
def get_dataset():
    """Return the sample input DataFrame for Spark Expectations to validate.

    Before returning, the DataFrame is registered as a temporary view named
    CONFIG["in_memory_source"]. The decorator is configured with the target
    table, table and temp-table writing enabled, and the notification settings.
    """
    source_df: DataFrame = input_df
    source_df.createOrReplaceTempView(CONFIG["in_memory_source"])
    return source_df

In [0]:
# Invoke the decorated function.
# NOTE(review): presumably this is what runs the rules, writes the tables and sends the
# configured notifications — confirm against with_expectations.
get_dataset()