### Mechanism X structured streaming code

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import row_number, col, current_timestamp, trim, lower, regexp_replace, expr, lit
from pyspark.sql.window import Window
import builtins

# Settings for the Mechanism X streaming job
CATALOG = "raw"
SCHEMA = "staging"
TRANSACTIONS_TABLE = "googledrive.raw.transactions"  # streaming source table
CHUNK_SIZE = 10000  # upper bound on rows per append to the staging path
VOLUME_PATH = "/Volumes/raw/staging/staging_volume/staging_data/"  # Delta output location
CHECKPOINT_PATH = "/Volumes/raw/staging/checkpoints/mechanism_x_streaming/"  # streaming checkpoint location

# Open the source table as a stream; rows appended to the Delta table
# arrive as new micro-batches.
df_stream = (
    spark.readStream
    .format("delta")
    .table(TRANSACTIONS_TABLE)
)

def process_batch(batch_df, batch_id):
    """Clean one streaming micro-batch and append it to ``VOLUME_PATH`` in chunks.

    Used as the ``foreachBatch`` callback of the streaming query. Rows are
    numbered by the ``_line`` column, split into consecutive ranges of at most
    ``CHUNK_SIZE`` rows, cleaned, and appended to the Delta location one chunk
    at a time.

    Args:
        batch_df: Micro-batch DataFrame. The code references the columns
            ``_line``, ``merchant``, ``customer``, ``category``, ``gender``,
            ``age`` and ``amount``.
        batch_id: Micro-batch identifier; used only in log messages.

    Raises:
        Exception: Any error raised while writing a chunk is logged and then
            re-raised. Returning normally after a failed write would let the
            stream mark the micro-batch as processed and silently drop the
            chunks that were never written.
    """
    # Count once: every count() is a separate Spark job over the batch.
    total_rows = batch_df.count()
    if total_rows == 0:
        print(f"Batch {batch_id}: No new data to process.")
        return

    print(f"Batch {batch_id}: Processing {total_rows} rows")

    # Add row number for chunking within batch. Adding a column does not change
    # the row count, so total_rows is still valid for the numbered frame.
    # NOTE: a window without partitionBy is evaluated on a single partition.
    window_spec = Window.orderBy("_line")
    batch_df = batch_df.withColumn("row_num", row_number().over(window_spec))

    start = 1
    while start <= total_rows:
        end = builtins.min(start + CHUNK_SIZE - 1, total_rows)

        chunk_df = batch_df.filter((col("row_num") >= start) & (col("row_num") <= end)).drop("row_num")

        # Clean & transform data: strip non-alphanumeric characters and
        # lowercase the text columns, and parse age/amount to numbers
        # (try_cast yields NULL instead of failing on unparseable values).
        chunk_df = chunk_df \
            .withColumn("merchant", trim(lower(regexp_replace(col("merchant"), "[^a-zA-Z0-9]", "")))) \
            .withColumn("customer", trim(lower(regexp_replace(col("customer"), "[^a-zA-Z0-9]", "")))) \
            .withColumn("category", trim(lower(regexp_replace(col("category"), "[^a-zA-Z0-9]", "")))) \
            .withColumn("gender", trim(lower(regexp_replace(col("gender"), "[^a-zA-Z]", "")))) \
            .withColumn("age", expr("try_cast(regexp_replace(age, '[^0-9]', '') AS int)")) \
            .withColumn("amount", expr("try_cast(regexp_replace(amount, '[^0-9.]', '') AS double)")) \
            .withColumn("ingestion_timestamp", current_timestamp()) \
            .dropna(subset=["merchant", "customer", "amount"])

        chunk_df = chunk_df.withColumn("event_time", current_timestamp())

        try:
            chunk_df.write.format("delta").mode("append").save(VOLUME_PATH)
            print(f"Batch {batch_id}: Wrote chunk rows {start} to {end}")
        except Exception as e:
            print(f"Batch {batch_id}: Error writing chunk rows {start} to {end}: {e}")
            # Propagate so the streaming query fails instead of checkpointing
            # a partially written batch.
            raise

        start += CHUNK_SIZE

# Launch the streaming query: each micro-batch is handed to process_batch,
# which chunks and writes it.
stream_writer = (
    df_stream.writeStream
    .foreachBatch(process_batch)
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(processingTime="1 second")
)
query = stream_writer.start()

# Block this cell until the query stops or fails.
query.awaitTermination()


### Create a flag table for Jobs

In [0]:
# %sql
# CREATE TABLE IF NOT EXISTS raw.staging.mechanism_flag (
#   status STRING
# );

# INSERT INTO raw.staging.mechanism_flag VALUES ('not_ready');


%md
### Mechanism X code for Workflows and Jobs

In [0]:
%python
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import builtins

# ────── Configuration ──────
CATALOG = "raw"
TRANSACTIONS_TABLE = "googledrive.raw.transactions"  # streaming source table
FLAG_TABLE = "raw.staging.mechanism_flag"  # status table updated after the first chunk write
CHUNK_SIZE = 10000  # upper bound on rows per append
CHUNK_OUTPUT_PATH = "/Volumes/raw/staging/staging_volume/staging_data/"  # Delta output location
CHECKPOINT_PATH = "/Volumes/raw/staging/checkpoints/mechanism_x_streaming/"  # streaming checkpoint location

# ────── Read Stream ──────
df_stream = (
    spark.readStream
    .format("delta")
    .table(TRANSACTIONS_TABLE)
)

# ────── Batch Processing ──────
def process_batch(batch_df, batch_id):
    """Clean one micro-batch, append it in chunks, and flag the data as ready.

    Used as the ``foreachBatch`` callback of the streaming query. Rows are
    numbered by the ``_line`` column, split into consecutive ranges of at most
    ``CHUNK_SIZE`` rows, cleaned, and appended to ``CHUNK_OUTPUT_PATH``. After
    the first chunk of the batch has been written, ``FLAG_TABLE`` (if it
    exists) is updated so that every row has ``status = 'ready'``.

    Args:
        batch_df: Micro-batch DataFrame. The code references the columns
            ``_line``, ``merchant``, ``customer``, ``category``, ``gender``,
            ``age`` and ``amount``.
        batch_id: Micro-batch identifier; used only in log messages.

    Write and SQL errors are not caught here; they propagate to the
    streaming query.
    """
    # Count once: every count() is a separate Spark job over the batch.
    total_rows = batch_df.count()
    if total_rows == 0:
        print(f"Batch {batch_id}: No new data.")
        return

    print(f"Batch {batch_id}: Processing {total_rows} rows")

    # Adding row_num does not change the row count, so total_rows stays valid.
    # NOTE: a window without partitionBy is evaluated on a single partition.
    window_spec = Window.orderBy("_line")
    batch_df = batch_df.withColumn("row_num", row_number().over(window_spec))

    start = 1
    wrote_flag = False

    while start <= total_rows:
        # builtins.min: the star import from pyspark.sql.functions shadows min().
        end = builtins.min(start + CHUNK_SIZE - 1, total_rows)

        chunk_df = batch_df.filter((col("row_num") >= start) & (col("row_num") <= end)).drop("row_num")

        # Clean columns: strip non-alphanumeric characters and lowercase the
        # text columns, and parse age/amount to numbers (try_cast yields NULL
        # instead of failing on unparseable values).
        chunk_df = chunk_df \
            .withColumn("merchant", trim(lower(regexp_replace(col("merchant"), "[^a-zA-Z0-9]", "")))) \
            .withColumn("customer", trim(lower(regexp_replace(col("customer"), "[^a-zA-Z0-9]", "")))) \
            .withColumn("category", trim(lower(regexp_replace(col("category"), "[^a-zA-Z0-9]", "")))) \
            .withColumn("gender", trim(lower(regexp_replace(col("gender"), "[^a-zA-Z]", "")))) \
            .withColumn("age", expr("try_cast(regexp_replace(age, '[^0-9]', '') AS int)")) \
            .withColumn("amount", expr("try_cast(regexp_replace(amount, '[^0-9.]', '') AS double)")) \
            .withColumn("event_time", current_timestamp()) \
            .dropna(subset=["merchant", "customer", "amount"])

        # Write chunk
        chunk_df.write.format("delta").mode("append").save(CHUNK_OUTPUT_PATH)
        print(f"✅ Batch {batch_id}: Wrote rows {start} to {end}")

        if not wrote_flag:
            # Check if the flag table exists before updating. The public
            # catalog API is used rather than the private _jsparkSession
            # handle, which is not available on every cluster type.
            if spark.catalog.tableExists(FLAG_TABLE):
                spark.sql(f"UPDATE {FLAG_TABLE} SET status = 'ready'")
                print("🚩 Flag updated in Delta table: status = 'ready'")
            else:
                print(f"🚩 Flag table {FLAG_TABLE} does not exist.")
            wrote_flag = True

        start += CHUNK_SIZE

# ────── Start Streaming ──────
stream_writer = (
    df_stream.writeStream
    .foreachBatch(process_batch)
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(processingTime="1 second")
)
query = stream_writer.start()

# Block this cell until the query stops or fails.
query.awaitTermination()

### Verifying the writes

In [0]:
# Read the staged Delta output back as a batch DataFrame and render it for inspection.
df = spark.read.load("/Volumes/raw/staging/staging_volume/staging_data/", format="delta")
df.display()