In [1]:
import sys, os

# Enable importing pysparky from the parent of the current working directory.
# The path is made absolute so a later change of working directory cannot
# redirect it, and the membership test keeps re-runs of this cell from
# appending duplicate entries to sys.path.
_parent_dir = os.path.abspath(os.pardir)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
import pyspark
from pyspark.sql import SparkSession, DataFrame, Column
from pyspark.sql import functions as F, types as T

# Print the installed PySpark version so the recorded output documents the
# environment this notebook was run in.
print(pyspark.__version__)

# Reuse an existing Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

3.5.2


24/10/21 15:44:14 WARN Utils: Your hostname, codespaces-0aafae resolves to a loopback address: 127.0.0.1; using 10.0.10.95 instead (on interface eth0)
24/10/21 15:44:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/21 15:44:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pysparky import spark_ext as spark_
from pysparky import functions as F_
from pysparky import enabler

from pysparky import transformations as te

In [4]:
import pandas as pd

# Load the sample address CSV over HTTP; column names are supplied explicitly
# through `names=`.
# NOTE(review): no dtypes are given, so pandas infers them. The table printed
# further down shows postcodes as plain numbers (e.g. 8075) and NaN entries in
# first_name/address -- confirm that is what this demo intends.
data_pdf = pd.read_csv(
    "https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv",
    names=["first_name", "last_name", "address", "region", "code", "postcode"],
)
# Convert the pandas frame into a Spark DataFrame for the checks below.
data_sdf = spark.createDataFrame(data_pdf)

24/10/21 15:44:33 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [5]:
# Preview the loaded rows; show() prints a text table and truncates long values.
data_sdf.show()

                                                                                

+--------------------+---------+--------------------+-----------+----+--------+
|          first_name|last_name|             address|     region|code|postcode|
+--------------------+---------+--------------------+-----------+----+--------+
|                John|      Doe|   120 jefferson st.|  Riverside|  NJ|    8075|
|                Jack| McGinnis|        220 hobo Av.|      Phila|  PA|    9119|
|       John "Da Man"|   Repici|   120 Jefferson St.|  Riverside|  NJ|    8075|
|             Stephen|    Tyler|7452 Terrace "At ...|   SomeTown|  SD|   91234|
|                 NaN| Blankman|                 NaN|   SomeTown|  SD|     298|
|Joan "the bone", ...|      Jet| 9th, at Terrace plc|Desert City|  CO|     123|
+--------------------+---------+--------------------+-----------+----+--------+



In [36]:
from dataclasses import dataclass, field


@dataclass
class CheckField:
    """A named check whose criteria are combined into a single expression.

    ``criteria`` is passed through ``enabler.ensure_list`` and then unpacked
    into ``F_.condition_and``; the result is stored in ``criteria_and`` once,
    when the instance is created.
    """

    # Name of the check. ``Criteria`` uses it as the name of the result column.
    check_name: str
    # One criterion or a list of them -- presumably pyspark ``Column``
    # expressions (TODO confirm against ``F_.condition_and``).
    criteria: "Column | list[Column]"
    # Combined criteria; not an ``__init__`` argument, set in ``__post_init__``.
    criteria_and: "Column" = field(init=False)

    def __post_init__(self) -> None:
        # ensure_list presumably wraps a bare criterion in a list; the example
        # cell in this notebook passes both bare values and lists.
        self.criteria_and = F_.condition_and(*enabler.ensure_list(self.criteria))


@dataclass
class Criteria:
    """A collection of named checks that can be evaluated against a DataFrame.

    ``check_matrix`` adds one column per check to a DataFrame;
    ``false_data`` and ``positive_data`` filter on those columns.
    """

    # Type hints in this class are quoted so they are not evaluated when the
    # class is created.
    checks: "list[CheckField]"

    @classmethod
    def fromDict(cls, data: "dict[str, Column | list[Column]]") -> "Criteria":
        """Build a ``Criteria`` from a ``{check_name: criteria}`` mapping.

        Every item is turned into ``CheckField(name, criteria)``; the checks
        keep the iteration order of ``data``.
        """
        return cls(
            checks=[CheckField(name, criteria) for name, criteria in data.items()]
        )

    @property
    def criteria_to_query(self) -> "dict[str, Column]":
        """Map each check name to that check's combined criteria expression."""
        return {check.check_name: check.criteria_and for check in self.checks}

    def check_matrix(self, sdf: "DataFrame") -> "DataFrame":
        """Return ``sdf`` with one added column per check, named after the check."""
        # Side effect kept for backward compatibility: the most recent input is
        # stored on the instance. No method of this class reads it.
        self.sdf = sdf
        return sdf.withColumns(self.criteria_to_query)

    def _filter_on_checks(
        self, sdf: "DataFrame", expected: bool, operator_: str
    ) -> "DataFrame":
        """Filter the check matrix on ``check_column == expected`` per check.

        The per-check conditions are handed to ``te.filters`` together with
        ``operator_`` ("and" / "or" as used by the public methods).
        """
        return te.filters(
            self.check_matrix(sdf),
            [F.col(column_name) == expected for column_name in self.criteria_to_query],
            operator_=operator_,
        )

    def false_data(self, sdf: "DataFrame") -> "DataFrame":
        """Return check-matrix rows where at least one check column is False.

        NOTE(review): a check that evaluates to NULL equals neither True nor
        False, so such a row would presumably be returned by neither this
        method nor ``positive_data`` -- confirm whether the check helpers can
        yield NULL.
        """
        return self._filter_on_checks(sdf, expected=False, operator_="or")

    def positive_data(self, sdf: "DataFrame") -> "DataFrame":
        """Return check-matrix rows where every check column is True.

        The check columns are still present in the result; callers can select
        the original columns to drop them.
        """
        return self._filter_on_checks(sdf, expected=True, operator_="and")

In [41]:
# Map each check name to its criteria. A value may be a single criterion or a
# list of criteria (as for "code_check"); CheckField passes the value through
# enabler.ensure_list before combining it.
criteria = {
    "first_name_check": F_.printable_only("first_name"),
    "last_name_check": F_.printable_only("last_name"),
    "address_check": F_.printable_only("address"),
    "region_check": F_.printable_only("region"),
    "code_check": [F_.two_character_only("code")],
    "postcode_check": F_.printable_only("postcode"),
}

# Build the checker once and reuse it for all three views instead of
# constructing an identical Criteria object for every call.
criteria_checker = Criteria.fromDict(criteria)

# Input columns plus one result column per check; equivalent to
# data_sdf.withColumns(criteria_checker.criteria_to_query).
criteria_checker.check_matrix(data_sdf).show()
# Rows for which at least one check is False.
criteria_checker.false_data(data_sdf).show()
# Rows for which every check is True, reduced to the original input columns.
criteria_checker.positive_data(data_sdf).select(data_sdf.columns).show()

+--------------------+---------+--------------------+-----------+----+--------+----------------+---------------+-------------+------------+----------+--------------+
|          first_name|last_name|             address|     region|code|postcode|first_name_check|last_name_check|address_check|region_check|code_check|postcode_check|
+--------------------+---------+--------------------+-----------+----+--------+----------------+---------------+-------------+------------+----------+--------------+
|                John|      Doe|   120 jefferson st.|  Riverside|  NJ|    8075|            true|           true|         true|        true|     false|          true|
|                Jack| McGinnis|        220 hobo Av.|      Phila|  PA|    9119|            true|           true|         true|        true|     false|          true|
|       John "Da Man"|   Repici|   120 Jefferson St.|  Riverside|  NJ|    8075|            true|           true|         true|        true|     false|          true|
|   