In [1]:
import sys

# Put the relative "../tsumugi/proto/" directory on the import path
# (presumably it holds the generated protobuf modules tsumugi needs -- TODO confirm).
# The path is relative, so it resolves against the kernel's working directory.
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
PROTO_DIR = "../tsumugi/proto/"
if PROTO_DIR not in sys.path:
    sys.path.append(PROTO_DIR)

In [2]:
from pyspark.sql import SparkSession
import pandas as pd

In [3]:
# Attach to the Spark Connect endpoint at sc://localhost:15002; getOrCreate()
# hands back an existing session when there is one instead of building another.
spark = (
    SparkSession.builder
    .remote("sc://localhost:15002")
    .getOrCreate()
)

In [4]:
# Small fixture: three rows with columns a (string), b (int) and c (int).
fixture_rows = [
    ["foo", 1, 4],
    ["bar", 2, 6],
    ["baz", 3, None],  # c is left NULL in this row
]
fixture_schema = "struct<a:string, b:int, c:int>"

test_data = spark.createDataFrame(fixture_rows, schema=fixture_schema)

In [5]:
# Print the fixture so its rows (including the NULL in column c) are visible
# before any checks are defined.
test_data.show()

+---+---+----+
|  a|  b|   c|
+---+---+----+
|foo|  1|   4|
|bar|  2|   6|
|baz|  3|NULL|
+---+---+----+



In [6]:
from tsumugi.verification import VerificationSuite
from tsumugi.analyzers import Size, Minimum, Completeness, CustomSql, ConstraintBuilder
from tsumugi.checks import CheckBuilder

In [7]:
# Check 1: the Size metric of the dataset must be >= 3.0.
size_check = (
    CheckBuilder()
    .with_constraint(
        ConstraintBuilder().for_analyzer(Size()).should_be_geq_than(3.0).build()
    )
    .with_description("hasSize(lambda x: x >= 3)")
    .build()
)

# Check 2: the minimum of column b must equal 0.0. It carries no description.
# The smallest b in the fixture is 1, so the data does not satisfy this constraint.
min_b_check = (
    CheckBuilder()
    .with_constraint(
        ConstraintBuilder()
        .for_analyzer(Minimum(column="b"))
        .should_be_eq_to(0.0)
        .build()
    )
    .build()
)

# Assemble the suite over the fixture; checks are added in the order defined above.
suite = (
    VerificationSuite.on_data(test_data)
    .add_check(size_check)
    .add_check(min_b_check)
)

In [8]:
# Run the suite through the Spark session. The following cells read
# result.check_results, result.checks and result.metrics.
result = suite.run_with_spark_session(spark)

In [9]:
from dataclasses import asdict

In [10]:
# Tabulate result.check_results: each dataclass entry is converted to a dict
# and becomes one row of the displayed pandas DataFrame.
pd.DataFrame.from_records(list(map(asdict, result.check_results)))

Unnamed: 0,level,check_description,constraint_message,metric_name,metric_instance,metric_entity,metric_value,status,constraint
0,Warning,hasSize(lambda x: x >= 3),,Size,*,Dataset,3.0,Success,SizeConstraint(Size(None))
1,Warning,,Value: 1.0 does not meet the constraint requir...,Minimum,b,Column,1.0,Failure,"MinimumConstraint(Minimum(b,None,Some(Analyzer..."


In [11]:
# Tabulate result.checks: each dataclass entry is converted to a dict
# and becomes one row of the displayed pandas DataFrame.
pd.DataFrame.from_records(list(map(asdict, result.checks)))

Unnamed: 0,check,check_level,check_status,constraint,constraint_status,constraint_message
0,hasSize(lambda x: x >= 3),Warning,Success,SizeConstraint(Size(None)),Success,
1,,Warning,Warning,"MinimumConstraint(Minimum(b,None,Some(Analyzer...",Failure,Value: 1.0 does not meet the constraint requir...


In [12]:
# Tabulate result.metrics: each dataclass entry is converted to a dict
# and becomes one row of the displayed pandas DataFrame.
pd.DataFrame.from_records(list(map(asdict, result.metrics)))

Unnamed: 0,entity,instance,name,value
0,Dataset,*,Size,3.0
1,Column,b,Minimum,1.0
