# Cleaning & PII Redaction

Detect and redact PII fields (email addresses, SSNs, phone numbers) using regex. Street-address detection with pretrained NER models is not implemented in the cell below.

In [None]:
import re
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Storage locations: the bronze table is read below, the silver path is the
# destination used when the cleaned data is written.
bronze_path = "abfss://bronze@ragstorage4122025.dfs.core.windows.net/contracts_bronze/"
silver_path = "abfss://silver@ragstorage4122025.dfs.core.windows.net/contracts_silver_delta/"

# Load the bronze Delta table.
delta_reader = spark.read.format("delta")
bronze_df = delta_reader.load(bronze_path)

# Convert binary to text
def extract_text(content):
    """Decode a bytes-like value to a UTF-8 string, best effort.

    Args:
        content: A ``bytes``/``bytearray`` payload (or any object with a
            compatible ``decode`` method). May be ``None``.

    Returns:
        The decoded text with undecodable byte sequences silently dropped,
        or ``""`` when ``content`` is ``None`` or cannot be decoded.
    """
    if content is None:
        return ""
    try:
        # errors="ignore" drops invalid UTF-8 sequences instead of raising.
        return content.decode("utf-8", errors="ignore")
    except (AttributeError, TypeError):
        # Not a bytes-like object (e.g. already a str, or a number): keep the
        # best-effort contract and return an empty string instead of failing.
        return ""

# Simple PII masking
# Patterns are compiled once at module level so each call reuses them.
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
_SSN_RE = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")
# US-style phone numbers: optional "+1" prefix, optional parentheses around
# the area code, and an optional "-", "." or single space between groups.
# The (?<!\w)/(?!\w) guards keep the plain 10-digit case equivalent to the
# previous \b\d{10}\b pattern, so longer digit runs are still left alone.
_PHONE_RE = re.compile(
    r"(?<!\w)(?:\+1[-. ]?)?(?:\(\d{3}\)|\d{3})[-. ]?\d{3}[-. ]?\d{4}(?!\w)"
)


def redact_pii(text):
    """Mask email addresses, SSNs and phone numbers in ``text``.

    Replacements are applied in order: emails, then SSNs in ``NNN-NN-NNNN``
    form, then phone numbers. Phone matching covers bare 10-digit numbers as
    well as common formatted variants such as ``555-123-4567``,
    ``(555) 123-4567`` and ``+1 555.123.4567``.

    Args:
        text: The string to scrub. May be ``None`` or empty.

    Returns:
        ``text`` with each match replaced by ``[EMAIL_REDACTED]``,
        ``[SSN_REDACTED]`` or ``[PHONE_REDACTED]``. Falsy input (``None`` or
        ``""``) is returned unchanged.
    """
    if not text:
        return text
    text = _EMAIL_RE.sub("[EMAIL_REDACTED]", text)
    # SSNs are masked before phones so the two digit patterns never compete.
    text = _SSN_RE.sub("[SSN_REDACTED]", text)
    text = _PHONE_RE.sub("[PHONE_REDACTED]", text)
    return text

# Wrap the plain Python helpers as Spark UDFs that return strings.
extract_udf = udf(f=extract_text, returnType=StringType())
redact_udf = udf(f=redact_pii, returnType=StringType())

# Step 1: decode the "content" column into a "text_raw" string column.
decoded_df = bronze_df.withColumn("text_raw", extract_udf(col("content")))

# Step 2: derive "text_clean" by running the redaction UDF over "text_raw".
# NOTE(review): "text_raw" (unredacted) stays in the resulting frame —
# confirm that keeping it alongside "text_clean" is intended.
silver_df = decoded_df.withColumn("text_clean", redact_udf(col("text_raw")))

# Persist the result as a Delta table at silver_path, replacing existing data.
silver_writer = (
    silver_df.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
silver_writer.save(silver_path)

# Read the table back from storage and show it, then report completion.
silver_readback = spark.read.format("delta").load(silver_path)
display(silver_readback)
print("Silver Delta created successfully.")
