In [None]:
%pip install git+https://github.com/tdoehmen/duckdq

In [None]:
from duckdq.checks import Check, CheckLevel

In [None]:
import time
import pandas as pd
from duckdq.checks import Check, CheckLevel
from duckdq.verification_suite import VerificationResult, VerificationSuite
from duckdq.engines.state_engine import StateEngine
from duckdq.utils.analysis_runner import AnalyzerContext
from duckdq.utils.exceptions import StateMergingException
from duckdq.verification_suite import VerificationResult, VerificationSuite
import duckdb
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import lit, current_timestamp


# Load the line-delimited JSON file of stock ticks into a pandas DataFrame.
df = pd.read_json('/dbfs/FileStore/tables/stockTicks.json', lines=True)

# Columns that all receive the same uniqueness / completeness / distinctness
# constraints. The order matters: it fixes the order of the constraint results.
tick_columns = [
    "buysell", "date", "ipaddr", "ordertype",
    "price", "quantity", "symbol", "time",
]

# Start with the row-count constraint, then append the per-column constraints
# group by group so they are registered in the same sequence as a single chain.
basic_check = Check(CheckLevel.WARNING, "Basic Check 2").has_size(lambda mx: mx == 1000)

for column in tick_columns:
    basic_check = basic_check.is_unique(column)

for column in tick_columns:
    basic_check = basic_check.is_complete(column)

for column in tick_columns:
    basic_check = basic_check.has_distinctness(column, lambda mx: mx > 0.1)

# Numeric-only constraints: median upper bounds and non-negativity.
basic_check = (
    basic_check
    .has_approx_quantile("price", 0.5, lambda mx: mx < 40)
    .has_approx_quantile("quantity", 0.5, lambda mx: mx < 2000)
    .is_non_negative("price")
    .is_non_negative("quantity")
)

# Attach the check to the data and evaluate it.
suite = VerificationSuite().on_data(df)
verificationResult = suite.add_check(basic_check).run()


In [None]:
# Bare expression: the notebook renders the result object's repr as this cell's output.
verificationResult

In [None]:
 
# Flatten the verification result into one record per evaluated constraint,
# then persist the records (plus a write timestamp) to the `duck_metrics` table.
data = []

for check, check_result in verificationResult.check_results.items():
    check_description = check.description
    for constraint_result in check_result.constraint_results:
        metric = constraint_result.metric
        # NOTE(review): guards against a constraint result with no metric attached
        # (presumably possible when the analysis could not run -- confirm against
        # duckdq). Such rows get the same placeholder as a failed metric.
        if metric is not None and metric.value.isSuccess:
            # The Metric column is declared as StringType below, so convert
            # explicitly instead of relying on Spark to coerce floats/ints.
            metric_s = str(metric.value.get())
        else:
            metric_s = "Error retrieving metric"
        data.append({
            "Check": check_description,
            "Constraint": str(constraint_result.constraint),
            "Status": constraint_result.status.name,
            "Metric": metric_s,
        })

# All four columns are nullable strings; Metric is stored as text because it
# may hold either a number or the error placeholder.
schema = StructType([
    StructField("Check", StringType(), True),
    StructField("Constraint", StringType(), True),
    StructField("Status", StringType(), True),
    StructField("Metric", StringType(), True),
])

ver_df = spark.createDataFrame(data, schema=schema)

# Stamp each row with the write time and replace the table's current contents.
metric_results = ver_df
metric_results.withColumn("ts", current_timestamp()).write.mode("overwrite").saveAsTable("duck_metrics")


In [None]:
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import current_timestamp
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType


# Read the persisted metrics back and expose Metric as a double for display.
# Values that are not numeric text (e.g. the error placeholder) become null in the cast.
metric_as_double = col("Metric").cast(DoubleType())
df = spark.table("duck_metrics").withColumn("Metric", metric_as_double)
# Optional narrowing to completeness/distinctness rows only:
# df_filtered = df.filter(col("Constraint").startswith("CompletenessConstraint") | col("Constraint").startswith("DistinctnessConstraint"))
display(df)


Check,Constraint,Status,Metric,batchID,ts
Basic Check 2,DistinctnessConstraint(Distinctness(price)),SUCCESS,0.975,,2024-04-11T02:39:19.342+0000
Basic Check 2,DistinctnessConstraint(Distinctness(quantity)),SUCCESS,0.911,,2024-04-11T02:39:19.342+0000
Basic Check 2,DistinctnessConstraint(Distinctness(symbol)),SUCCESS,0.93,,2024-04-11T02:39:19.342+0000
Basic Check 2,DistinctnessConstraint(Distinctness(time)),SUCCESS,0.996,,2024-04-11T02:39:19.342+0000
Basic Check 2,QuantileConstraint(Quantile(price_0.5)),SUCCESS,20.08799934387207,,2024-04-11T02:39:19.342+0000
Basic Check 2,QuantileConstraint(Quantile(quantity_0.5)),SUCCESS,1910.9981689453125,,2024-04-11T02:39:19.342+0000
Basic Check 2,ComplianceConstraint(Compliance(price is non-negative)),SUCCESS,1.0,,2024-04-11T02:39:19.342+0000
Basic Check 2,ComplianceConstraint(Compliance(quantity is non-negative)),FAILURE,0.955,,2024-04-11T02:39:19.342+0000
Basic Check 2,CompletenessConstraint(Completeness(quantity)),FAILURE,0.954,,2024-04-11T02:39:19.342+0000
Basic Check 2,CompletenessConstraint(Completeness(symbol)),SUCCESS,1.0,,2024-04-11T02:39:19.342+0000


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.