In [1]:
import sys, os

# Enable importing pysparky
sys.path.append(os.pardir)

In [2]:
import pyspark
from pyspark.sql import SparkSession, DataFrame, Column
from pyspark.sql import functions as F, types as T

print(pyspark.__version__)

spark = SparkSession.builder.getOrCreate()

3.5.2


24/10/21 21:28:44 WARN Utils: Your hostname, codespaces-0aafae resolves to a loopback address: 127.0.0.1; using 10.0.10.187 instead (on interface eth0)
24/10/21 21:28:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/21 21:28:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pysparky import spark_ext as spark_
from pysparky import functions as F_
from pysparky import enabler

from pysparky import transformations as te

In [4]:
import pandas as pd

data_pdf = pd.read_csv(
    "https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv",
    names=["first_name", "last_name", "address", "region", "code", "postcode"],
)
data_sdf = spark.createDataFrame(data_pdf)

In [5]:
data_sdf.show()

                                                                                

+--------------------+---------+--------------------+-----------+----+--------+
|          first_name|last_name|             address|     region|code|postcode|
+--------------------+---------+--------------------+-----------+----+--------+
|                John|      Doe|   120 jefferson st.|  Riverside|  NJ|    8075|
|                Jack| McGinnis|        220 hobo Av.|      Phila|  PA|    9119|
|       John "Da Man"|   Repici|   120 Jefferson St.|  Riverside|  NJ|    8075|
|             Stephen|    Tyler|7452 Terrace "At ...|   SomeTown|  SD|   91234|
|                 NaN| Blankman|                 NaN|   SomeTown|  SD|     298|
|Joan "the bone", ...|      Jet| 9th, at Terrace plc|Desert City|  CO|     123|
+--------------------+---------+--------------------+-----------+----+--------+



In [6]:
from pysparky.data_validator import DataValidator, ValidationRule

In [7]:
ValidationRules = [
    ValidationRule("first_name_check", F_.printable_only("first_name")),
    ValidationRule("last_name_check", F_.printable_only("last_name")),
    ValidationRule("address_check", F_.printable_only("address")),
    ValidationRule("region_check", F_.printable_only("region")),
    ValidationRule("code_check", [F_.two_character_only("code")]),
    ValidationRule("postcode_check", F_.printable_only("postcode")),
]

validator = DataValidator(ValidationRules)
validator.apply_conditions(data_sdf).show()
validator.filter_invalid(data_sdf).show()
validator.filter_valid(data_sdf).select(data_sdf.columns).show()

data_sdf.withColumns(validator.query_map).show()

+--------------------+---------+--------------------+-----------+----+--------+----------------+---------------+-------------+------------+----------+--------------+
|          first_name|last_name|             address|     region|code|postcode|first_name_check|last_name_check|address_check|region_check|code_check|postcode_check|
+--------------------+---------+--------------------+-----------+----+--------+----------------+---------------+-------------+------------+----------+--------------+
|                John|      Doe|   120 jefferson st.|  Riverside|  NJ|    8075|            true|           true|         true|        true|     false|          true|
|                Jack| McGinnis|        220 hobo Av.|      Phila|  PA|    9119|            true|           true|         true|        true|     false|          true|
|       John "Da Man"|   Repici|   120 Jefferson St.|  Riverside|  NJ|    8075|            true|           true|         true|        true|     false|          true|
|   

In [8]:
conditions = {
    "first_name_check": F_.printable_only("first_name"),
    "last_name_check": F_.printable_only("last_name"),
    "address_check": F_.printable_only("address"),
    "region_check": F_.printable_only("region"),
    "code_check": [F_.two_character_only("code")],
    "postcode_check": F_.printable_only("postcode"),
}


validator = DataValidator.from_dict(conditions)
validator.apply_conditions(data_sdf).show()
validator.filter_invalid(data_sdf).show()
validator.filter_valid(data_sdf).select(data_sdf.columns).show()

data_sdf.withColumns(validator.query_map).show()

+--------------------+---------+--------------------+-----------+----+--------+----------------+---------------+-------------+------------+----------+--------------+
|          first_name|last_name|             address|     region|code|postcode|first_name_check|last_name_check|address_check|region_check|code_check|postcode_check|
+--------------------+---------+--------------------+-----------+----+--------+----------------+---------------+-------------+------------+----------+--------------+
|                John|      Doe|   120 jefferson st.|  Riverside|  NJ|    8075|            true|           true|         true|        true|     false|          true|
|                Jack| McGinnis|        220 hobo Av.|      Phila|  PA|    9119|            true|           true|         true|        true|     false|          true|
|       John "Da Man"|   Repici|   120 Jefferson St.|  Riverside|  NJ|    8075|            true|           true|         true|        true|     false|          true|
|   

24/10/21 21:29:03 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [14]:
data = [(1, "valid"), (2, "invalid"), (3, "valid")]
sdf = spark.createDataFrame(data, ["id", "status"])

# Define validation rules
rules_dict = {"status_check": [F.col("status") == "valid"]}
validator = DataValidator.from_dict(rules_dict)

# Check query map
query_map = validator.query_map
result_data = sdf.withColumns(query_map).collect()
expected_data = [(1, "valid", True), (2, "invalid", False), (3, "valid", True)]

assert result_data == expected_data
# assert "status_check" in query_map
# assert query_map["status_check"] == (F.col("status") == "valid")

[Row(id=1, status='valid', status_check=True),
 Row(id=2, status='invalid', status_check=False),
 Row(id=3, status='valid', status_check=True)]