In [0]:
%sh
# Clear the Delta checkpoint left behind by a previous run.
rm -rf /dbfs/tmp/StreamingDataQuality/checkpoint

# Download some generated stock tick data. This is a public Mockaroo endpoint,
# so availability cannot be guaranteed.
# NOTE(review): the API key is embedded in the URL; confirm it is meant to be public.
#
# --fail makes curl exit non-zero on an HTTP error instead of writing the error
# page into stockTicks.json as if it were tick data; --show-error prints the reason.
# On any failure the (empty or partial) output file is removed so that later
# cells cannot pick up bad input.
if ! curl --fail --show-error "https://api.mockaroo.com/api/2aedaa80?count=1000&key=8eb06b50" > /tmp/stockTicks.json; then
    rm -f /tmp/stockTicks.json
    echo "Download of stock tick data failed; /tmp/stockTicks.json was not created." >&2
    exit 1
fi


In [0]:
# Move the downloaded tick file from the local filesystem (file:) into DBFS (dbfs:).
local_ticks_uri = "file:/tmp/stockTicks.json"
dbfs_ticks_uri = "dbfs:/tmp/StreamingDataQuality/stockTicks.json"
dbutils.fs.mv(local_ticks_uri, dbfs_ticks_uri)

In [0]:
%pip install git+https://github.com/tdoehmen/duckdq

In [0]:
from duckdq.checks import Check, CheckLevel

In [0]:
%scala
import spark.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.concat

// Locations used by this notebook: the parquet source and the checkpoint directory.
val data_path = "/tmp/StreamingDataQuality/source/"
val checkpoint_path = "/tmp/StreamingDataQuality/checkpoint/"

// Source data, plus a zero-row frame with the same schema (the predicate is never true).
val base_df = spark.read.parquet(data_path)
val empty_df = base_df.filter("0 = 1")
val l1 = 0L

// Drop the tables this notebook (re)creates so that each run starts clean.
Seq("trades_delta", "bad_records", "duck_metrics").foreach { tableName =>
  spark.sql(s"DROP TABLE IF EXISTS $tableName")
}

// Expose the source data to SQL under a temporary view name.
base_df.createOrReplaceTempView("trades_historical")

// Create the empty Delta tables; bad_records carries an extra Long batchID column.
empty_df.write.format("delta").saveAsTable("trades_delta")
val bad_records_seed = empty_df.withColumn("batchID", lit(l1))
bad_records_seed.write.format("delta").saveAsTable("bad_records")

dbutils.fs.mkdirs(checkpoint_path)

In [0]:
import time
import pandas as pd
from duckdq.checks import Check, CheckLevel
from duckdq.verification_suite import VerificationSuite
from duckdq.verification_suite import VerificationResult, VerificationSuite
from duckdq.engines.state_engine import StateEngine
from duckdq.utils.analysis_runner import AnalyzerContext
from duckdq.utils.exceptions import StateMergingException
from duckdq.verification_suite import VerificationResult, VerificationSuite
import duckdb
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import lit, current_timestamp


# Read the tick data with Spark and convert it to pandas, which is what the
# verification suite below is given via on_data().
df = spark.read.json("/tmp/StreamingDataQuality/stockTicks.json").toPandas()

# Warning-level check: the three columns must be complete, and quantity must be
# non-negative with a maximum of at most 10000.
basic_check = (
    Check(CheckLevel.WARNING, "Basic Check 2")
    .is_complete("ipaddr")
    .is_complete("quantity")
    .is_complete("price")
    .has_max("quantity", lambda mx: mx <= 10000)
    .is_non_negative("quantity")
)
verificationResult = VerificationSuite().on_data(df).add_check(basic_check).run()

# Flatten the nested results into one record per (check, constraint) pair.
data = []
for check, check_result in verificationResult.check_results.items():
    for constraint_result in check_result.constraint_results:
        record = {
            "Check": check.description,
            "Constraint": str(constraint_result.constraint),
            "Status": constraint_result.status.name,
        }
        outcome = constraint_result.metric.value
        # Take the computed value when the metric succeeded; otherwise store a
        # fixed placeholder text in its place.
        if outcome.isSuccess:
            record["Metric"] = outcome.get()
        else:
            record["Metric"] = "Error retrieving metric"
        data.append(record)

# All four columns are nullable strings; Metric is a string so that the
# placeholder text above fits in the same column as the metric values.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("Check", "Constraint", "Status", "Metric")
    ]
)

ver_df = spark.createDataFrame(data, schema=schema)

# Stamp each row with the write time and append it to the duck_metrics Delta table.
metric_results = ver_df
(
    metric_results
    .withColumn("ts", current_timestamp())
    .write.format("delta")
    .mode("append")
    .saveAsTable("duck_metrics")
)




In [0]:
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import current_timestamp
import pandas as pd
from pyspark.sql.functions import col

# Read back the persisted verification results and keep only the completeness
# constraints for display.
df = spark.table("duck_metrics")

# Cast Metric to a numeric column. The cast is written with the type name
# "double" rather than the DoubleType class: this cell does not import
# DoubleType, so naming the type as a string keeps the cell independent of
# imports made in other cells.
df = df.withColumn("Metric", col("Metric").cast("double"))

df_filtered = df.filter(col("Constraint").startswith("CompletenessConstraint"))
display(df_filtered)


Check,Constraint,Status,Metric,ts
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.952,2024-04-09T20:04:13.798+0000
Basic Check 2,CompletenessConstraint(Completeness(ipaddr)),FAILURE,0.965,2024-04-09T20:04:13.798+0000
Basic Check 2,CompletenessConstraint(Completeness(price)),FAILURE,0.986,2024-04-09T20:04:13.798+0000


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.