# Cleaning & PII Redaction

Detect and redact PII fields (SSNs, email addresses, phone numbers) using regex + pretrained NER models.

In [None]:

from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
import re

# Ordered (compiled pattern, replacement token) pairs.
# Order matters: e-mail addresses are replaced first so that digit runs inside
# an address are not partially rewritten by the numeric patterns, and the SSN
# shape (3-2-4) is tried before the phone shapes (10 digits, 3-3-4).
_PII_PATTERNS = (
    (
        re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
        "[EMAIL_REDACTED]",
    ),
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[SSN_REDACTED]"),
    (re.compile(r"\b\d{10}\b"), "[PHONE_REDACTED]"),
    (re.compile(r"\b\d{3}-\d{3}-\d{4}\b"), "[PHONE_REDACTED]"),
)


def redact_pii(text):
    """Replace e-mail addresses, SSNs and phone numbers in *text* with tokens.

    Recognised shapes:
      * e-mail address            -> ``[EMAIL_REDACTED]``
      * SSN ``ddd-dd-dddd``       -> ``[SSN_REDACTED]``
      * phone ``dddddddddd``      -> ``[PHONE_REDACTED]``
      * phone ``ddd-ddd-dddd``    -> ``[PHONE_REDACTED]``

    Args:
        text: The string to scrub, or ``None``.

    Returns:
        The scrubbed string, or ``None`` when *text* is ``None`` (so null
        column values pass through the UDF unchanged).
    """
    if text is None:
        return text
    for pattern, replacement in _PII_PATTERNS:
        text = pattern.sub(replacement, text)
    return text

# Wrap redact_pii as a Spark UDF that returns a string column.
redact_udf = udf(redact_pii, StringType())

# Storage location used below both as the read source and as the write target.
silver_path = "abfss://silver@ragstorage4122025.dfs.core.windows.net"

# Load every file under silver_path (recursively) with the binaryFile source.
# NOTE(review): despite its name, bronze_df is read from silver_path, and it is
# not referenced again in the visible code — confirm the intended source path.
bronze_df = (
    spark.read
    .format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(silver_path)
)
# Add a "text_clean" column holding the redacted version of "text_raw".
# NOTE(review): silver_df is read here before any assignment in the visible
# code; presumably it is defined by an earlier cell — otherwise this raises
# NameError. If bronze_df was the intended input, note that (per the Spark
# docs) binaryFile rows expose path/modificationTime/length/content and no
# "text_raw" column, so a text-extraction step would be needed first — confirm.
silver_df = silver_df.withColumn(
    "text_clean",
    redact_udf(col("text_raw"))
)

# Write the result as a Delta table, replacing whatever is at silver_path.
# NOTE(review): this overwrites the same location bronze_df was loaded from.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path)
)