## THIS NOTEBOOK CONTAINS

_**Pipeline Runner**_

This notebook contains:
- The compiled eligibility pipeline logic (validation, standardization, rejection handling)
- The pipeline execution entry point

Running this notebook reads partner configurations from `partners.json`,
processes all enabled partners, and writes unified and reject datasets
to the Output directory.


In [0]:
def split_clean_and_rejects(df: DataFrame, cfg: dict):
    """Split partner rows into a clean output set and a reject set.

    Each row is tagged with the first validation rule it fails, checked in
    this order:

    1. ``missing_external_id`` - ``external_id`` is null or has length 0.
    2. ``invalid_dob`` - ``dob`` is non-null but ``dob_parsed`` is null.
    3. ``invalid_phone`` - ``phone_digits`` is non-null and its length is
       not exactly the required digit count.

    Rows that fail no rule get a null ``reject_reason`` and are clean.

    Args:
        df: Input rows. The function reads the columns ``external_id``,
            ``first_name``, ``last_name``, ``dob``, ``dob_parsed``,
            ``email``, ``phone_digits`` and ``partner_code``.
            NOTE(review): presumably ``dob_parsed`` is the date parsed from
            ``dob`` and ``phone_digits`` is the phone stripped to digits by
            an upstream step - confirm against the caller.
        cfg: Partner configuration. The required phone length is read from
            ``cfg["validation"]["phone_digits_required"]``; when the section
            or the key is missing or null it defaults to 10.

    Returns:
        A ``(clean, rejects)`` tuple of DataFrames. ``clean`` has the columns
        ``external_id``, ``first_name``, ``last_name``, ``dob`` (formatted
        ``yyyy-MM-dd``), ``email``, ``phone`` (dash-separated, null when no
        phone was supplied) and ``partner_code``. ``rejects`` keeps every
        input column plus ``reject_reason``.
    """
    # A missing/null "validation" section or key falls back to the default
    # instead of raising KeyError / TypeError.
    validation = cfg.get("validation") or {}
    configured_digits = validation.get("phone_digits_required")
    required_digits = 10 if configured_digits is None else int(configured_digits)

    ext_ok = F.col("external_id").isNotNull() & (F.length(F.col("external_id")) > 0)
    # dob and phone are optional: a null source value passes, but a supplied
    # value must have parsed / must have the exact required length.
    dob_ok = F.col("dob").isNull() | F.col("dob_parsed").isNotNull()
    phone_ok = F.col("phone_digits").isNull() | (F.length(F.col("phone_digits")) == required_digits)

    # No .otherwise(): rows passing every check keep reject_reason = NULL.
    reject_reason = (
        F.when(~ext_ok, "missing_external_id")
         .when(~dob_ok, "invalid_dob")
         .when(~phone_ok, "invalid_phone")
    )

    df = df.withColumn("reject_reason", reject_reason)

    # The first two groups are always 3 digits. The last group is 4 digits for
    # the default 10-digit format and grows to hold the remainder when more
    # digits are required, so no validated digit is dropped from the output.
    last_group_len = max(4, required_digits - 6)
    phone_fmt = F.when(
        F.length("phone_digits") == required_digits,
        F.concat_ws(
            "-",
            F.substring("phone_digits", 1, 3),
            F.substring("phone_digits", 4, 3),
            F.substring("phone_digits", 7, last_group_len),
        ),
    )

    clean = (
        df.filter(F.col("reject_reason").isNull())
          .select(
              "external_id", "first_name", "last_name",
              F.date_format("dob_parsed", "yyyy-MM-dd").alias("dob"),
              "email", phone_fmt.alias("phone"), "partner_code"
          )
    )

    rejects = df.filter(F.col("reject_reason").isNotNull())
    return clean, rejects

In [0]:
# Pipeline execution entry point: run the eligibility pipeline using the
# partner configuration file at the path below.
# NOTE(review): `run_pipeline` and `spark` are not defined in this cell; they
# must already exist in the notebook session before this cell is executed.
run_pipeline(
    spark=spark,
    config_path="/Volumes/workspace/default/eligibility_volume/Config/partners.json"
)
