In [None]:
%sh
# Stop at the first failing command so a broken download is not silently ignored.
set -e

# clear the delta checkpoint
rm -rf /dbfs/tmp/StreamingDataQuality/checkpoint

# Remove any file left behind by an earlier run so a failed download cannot be
# mistaken for fresh data by the cells that follow.
rm -f /tmp/stockTicks.json

# download some generated stock tick data; this is a public Mockaroo endpoint- as such, we can't guarantee availability!
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page as if it were tick data; --output only creates the file once data arrives.
# NOTE(review): the API key is embedded in the URL; acceptable only while it is a public demo key.
curl --fail --silent --show-error \
     --output /tmp/stockTicks.json \
     "https://api.mockaroo.com/api/2aedaa80?count=1000&key=8eb06b50"


In [None]:
# Move the downloaded file from the local filesystem (file:/) into DBFS (dbfs:/).
local_ticks_uri = "file:/tmp/stockTicks.json"
dbfs_ticks_uri = "dbfs:/tmp/StreamingDataQuality/stockTicks.json"
dbutils.fs.mv(local_ticks_uri, dbfs_ticks_uri)

In [None]:
# Read the downloaded JSON and rewrite it as parquet, repartitioned into 100
# partitions, replacing whatever is already in the source directory.
raw_ticks_df = spark.read.json("/tmp/StreamingDataQuality/stockTicks.json")
(
    raw_ticks_df
    .repartition(100)
    .write
    .mode("overwrite")
    .parquet("/tmp/StreamingDataQuality/source/")
)

In [None]:
# List the parquet part files written to the source directory.
display(dbutils.fs.ls("/tmp/StreamingDataQuality/source/"))

path,name,size
dbfs:/tmp/StreamingDataQuality/source/_SUCCESS,_SUCCESS,0
dbfs:/tmp/StreamingDataQuality/source/_committed_6528151237972479421,_committed_6528151237972479421,9918
dbfs:/tmp/StreamingDataQuality/source/_started_6528151237972479421,_started_6528151237972479421,0
dbfs:/tmp/StreamingDataQuality/source/part-00000-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-2-1-c000.snappy.parquet,part-00000-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-2-1-c000.snappy.parquet,2758
dbfs:/tmp/StreamingDataQuality/source/part-00001-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-3-1-c000.snappy.parquet,part-00001-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-3-1-c000.snappy.parquet,2770
dbfs:/tmp/StreamingDataQuality/source/part-00002-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-4-1-c000.snappy.parquet,part-00002-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-4-1-c000.snappy.parquet,2725
dbfs:/tmp/StreamingDataQuality/source/part-00003-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-5-1-c000.snappy.parquet,part-00003-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-5-1-c000.snappy.parquet,2754
dbfs:/tmp/StreamingDataQuality/source/part-00004-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-6-1-c000.snappy.parquet,part-00004-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-6-1-c000.snappy.parquet,2782
dbfs:/tmp/StreamingDataQuality/source/part-00005-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-7-1-c000.snappy.parquet,part-00005-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-7-1-c000.snappy.parquet,2761
dbfs:/tmp/StreamingDataQuality/source/part-00006-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-8-1-c000.snappy.parquet,part-00006-tid-6528151237972479421-7b3266ae-77e1-4333-ba8b-c7d75d1750d2-8-1-c000.snappy.parquet,2750


In [None]:
%sh
# Remove the streaming checkpoint directory (no error if it does not exist).
CHECKPOINT_DIR=/dbfs/tmp/StreamingDataQuality/checkpoint
rm -rf "$CHECKPOINT_DIR"


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType


# Directory holding the parquet part files consumed by the streaming reader.
inputPath = "/tmp/StreamingDataQuality/source/"

# Explicit schema for the tick data: every column is nullable; price and
# quantity are doubles, everything else is a string.
_tick_columns = [
    ('buysell', StringType()),
    ('date', StringType()),
    ('ipaddr', StringType()),
    ('ordertype', StringType()),
    ('price', DoubleType()),
    ('quantity', DoubleType()),
    ('symbol', StringType()),
    ('time', StringType()),
]
jsonSchema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _tick_columns]
)

# Streaming read of the parquet source directory with the explicit schema.
streamingInputDF = (
    spark.readStream
    .format("parquet")
    .schema(jsonSchema)               # schema of the parquet tick files
    .option("maxFilesPerTrigger", 1)  # one file per trigger, so the directory behaves like a stream
    .load(inputPath)
)

# Running number of ticks per `date` value.
streamingCountsDF = streamingInputDF.groupBy("date").count()

# Configure the sink first, then start the stream:
#   memory   -> results are kept in an in-memory table (for testing only)
#   counts   -> name of that in-memory table
#   complete -> the table always holds the full set of counts
counts_writer = (
    streamingCountsDF.writeStream
    .outputMode("complete")
    .queryName("counts")
    .format("memory")
)
query = counts_writer.start()

In [None]:
%pip install git+https://github.com/tdoehmen/duckdq

In [None]:
from duckdq.checks import Check, CheckLevel


In [None]:
%scala
import spark.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.concat

// Locations used by the streaming pipeline.
val data_path = "/tmp/StreamingDataQuality/source/"
val checkpoint_path = "/tmp/StreamingDataQuality/checkpoint/"

// Historical data, plus a zero-row frame with the same schema that is used to
// create the empty target tables.
val base_df = spark.read.parquet(data_path)
val empty_df = base_df.where("0 = 1")
val l1: Long = 0

// Drop tables left over from a previous run.
Seq(
  "DROP TABLE IF EXISTS trades_delta",
  "DROP TABLE IF EXISTS bad_records",
  "DROP TABLE IF EXISTS duck_metrics"
).foreach(statement => spark.sql(statement))

base_df.createOrReplaceTempView("trades_historical")

// trades_delta keeps the source schema; bad_records adds a Long batchID column.
empty_df
  .write
  .format("delta")
  .saveAsTable("trades_delta")
empty_df
  .withColumn("batchID", lit(l1))
  .write
  .format("delta")
  .saveAsTable("bad_records")

dbutils.fs.mkdirs(checkpoint_path)

In [None]:
%sh
# Remove the streaming checkpoint directory (no error if it does not exist).
CHECKPOINT_DIR=/dbfs/tmp/StreamingDataQuality/checkpoint
rm -rf "$CHECKPOINT_DIR"


In [None]:
# Stream the parquet source files, one file per trigger, into the trades_delta table.
source_path = "/tmp/StreamingDataQuality/source/"

# Take the schema from the parquet files themselves instead of a variable defined
# in an earlier cell: this cell runs after the `%pip install` cell, and Databricks
# resets Python state after %pip, so earlier Python variables are gone on a fresh run.
source_schema = spark.read.parquet(source_path).schema

(spark
 .readStream
 .schema(source_schema)
 .format("parquet")
 .option("maxFilesPerTrigger", 1)
 .load(source_path)
 .writeStream
 .format("delta")
 .option("failOnDataLoss", "false")
 .option("checkpointLocation", "/tmp/StreamingDataQuality/checkpoint/")
 .toTable("trades_delta"))

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp
from duckdq.checks import Check, CheckLevel
from duckdq.verification_suite import VerificationResult, VerificationSuite
from pyspark.sql.types import LongType

# Get (or create) the Spark session for this application.
spark = (
    SparkSession.builder
    .appName("StreamingDataQuality")
    .getOrCreate()
)

# Streaming source over the trades_delta table.
trades_delta_stream = (
    spark.readStream
    .format("delta")
    .table("trades_delta")
)

# Process each batch of data
def process_batch(batchDF, batchId):
    """Run the DuckDQ checks on one micro-batch and persist the outcome.

    The batch is converted to pandas and verified with "Basic Check 2"
    (completeness of ipaddr/quantity/price, quantity <= 10000, quantity
    non-negative). One row per constraint is appended to the ``duck_metrics``
    table with a timestamp. If the overall verification did not succeed, the
    whole batch, tagged with its batch id, is appended to ``bad_records``.

    Args:
        batchDF: Spark DataFrame holding the rows of the current micro-batch.
        batchId: Identifier of the micro-batch, stored as a long in ``batchID``.
    """
    # Imported here so the function does not rely on an import made in an earlier
    # cell: Databricks resets Python state after `%pip install`, which would
    # otherwise leave these names undefined when the batch runs.
    from pyspark.sql.types import StringType, StructField, StructType

    # DuckDQ runs on pandas data, so the micro-batch is collected with toPandas().
    batch_pdf = batchDF.toPandas()
    verificationResult = (
        VerificationSuite()
        .on_data(batch_pdf)
        .add_check(
            Check(CheckLevel.WARNING, "Basic Check 2")
            .is_complete("ipaddr")
            .is_complete("quantity")
            .is_complete("price")
            .has_max("quantity", lambda mx: mx <= 10000)
            .is_non_negative("quantity")
        )
        .run()
    )

    # Flatten the result into one record per constraint.
    metric_rows = []
    for check, check_result in verificationResult.check_results.items():
        for constraint_result in check_result.constraint_results:
            metric_value = constraint_result.metric.value
            metric_s = metric_value.get() if metric_value.isSuccess else "Error retrieving metric"
            metric_rows.append({
                "Check": check.description,
                "Constraint": str(constraint_result.constraint),
                "Status": constraint_result.status.name,
                # The column is declared as a string, so store a string explicitly.
                "Metric": str(metric_s),
            })

    metrics_schema = StructType([
        StructField("Check", StringType(), True),
        StructField("Constraint", StringType(), True),
        StructField("Status", StringType(), True),
        StructField("Metric", StringType(), True),
    ])

    # The overall status may be an enum member rather than a plain string
    # (constraint statuses above are read through `.name`). Comparing an enum to
    # the literal "Success" never matches and would flag every batch as bad, so
    # compare by name, case-insensitively, which handles both representations.
    overall_status = verificationResult.status
    status_name = str(getattr(overall_status, "name", overall_status))
    checks_passed = status_name.upper() == "SUCCESS"

    # If verification fails, write batch to bad records table
    if not checks_passed:
        (batchDF
            .withColumn("batchID", lit(batchId).cast(LongType()))
            .write.format("delta").mode("append").saveAsTable("bad_records"))

    # Always record the metrics for this batch, stamped with the write time.
    (spark.createDataFrame(metric_rows, schema=metrics_schema)
        .withColumn("ts", current_timestamp())
        .write.format("delta").mode("append").saveAsTable("duck_metrics"))

# Hand every micro-batch of the trades_delta stream to process_batch, then start the stream.
batch_writer = trades_delta_stream.writeStream.foreachBatch(process_batch)
query = batch_writer.start()



In [None]:
%scala
// Stream the duck_metrics Delta table (with ignoreChanges enabled) and render it.
val duck_metrics_stream = spark.readStream
  .format("delta")
  .option("ignoreChanges", "true")
  .table("duck_metrics")

display(duck_metrics_stream)

Check,Constraint,Status,Metric,ts
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:09:39.855+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:11:18.677+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:10:27.335+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:11:03.232+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:11:34.234+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:09:11.713+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.85,2024-04-09T05:09:35.564+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:09:56.237+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:09:25.423+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.95,2024-04-09T05:09:15.756+0000


In [None]:
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import current_timestamp
import pandas as pd
from pyspark.sql.functions import col

# Load the recorded metrics. Metric is stored as a string, so cast it to a double.
# The target type is given by name ("double") rather than DoubleType(): no import
# in this cell brings DoubleType into scope, and an import from a cell before
# `%pip install` does not survive the Python state reset that %pip triggers.
df = spark.table("duck_metrics").withColumn("Metric", col("Metric").cast("double"))

# Keep only the completeness constraints for display.
df_filtered = df.filter(col("Constraint").startswith("CompletenessConstraint"))
display(df_filtered)


Check,Constraint,Status,Metric,ts
Basic Check 2,CompletenessConstraint(Completeness(price)),FAILURE,0.9666666666666668,2024-04-09T05:32:47.410+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.95,2024-04-09T05:31:58.023+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.95,2024-04-09T05:32:51.730+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.95,2024-04-09T05:33:21.859+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.95,2024-04-09T05:33:11.825+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.95,2024-04-09T05:31:16.362+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.85,2024-04-09T05:31:34.234+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.95,2024-04-09T05:31:12.306+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.95,2024-04-09T05:33:02.140+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.95,2024-04-09T05:32:41.513+0000


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.