In [0]:
import builtins  # for min function fix
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    current_timestamp,
    expr,
    lit,
    lower,
    regexp_replace,
    row_number,
    trim,
)
from pyspark.sql.window import Window

# Config
# Catalog and schema that hold the progress-tracking table.
CATALOG = "testing"
SCHEMA = "processed_data"
PROGRESS_TABLE = ".".join((CATALOG, SCHEMA, "chunk_progress"))

# Fully qualified name of the table the chunks are read from.
TRANSACTIONS_TABLE = "transactions.financial_transactions_google_drive.transactions"

# ✅ Use volume path instead of raw S3 bucket
VOLUME_PATH = "/Volumes/testing/processed_data/staging_volume/staging_data/"

# Number of source rows covered by each chunk.
CHUNK_SIZE = 10_000

# Start Spark session
spark = SparkSession.builder.getOrCreate()

# Load full source table
print(f"Reading source table: {TRANSACTIONS_TABLE}")
df_full = spark.read.table(TRANSACTIONS_TABLE)
# Total row count; used as the upper bound when slicing the table into chunks.
max_row = df_full.count()
print(f"Total rows in source table: {max_row}")

if max_row == 0:
    print("❌ Source table is empty. Exiting.")
    # exit() is a convenience injected by the `site` module for interactive
    # shells and is not meant for programs; raising SystemExit directly stops
    # execution without depending on that helper being present.
    raise SystemExit

# Add row number column ordered by _line (adjust if needed)
# The constant partition key lit(1) puts every row into one window partition,
# so row_num is a single sequence over the whole table starting at 1.
# NOTE(review): if _line is not unique, rows that tie may be numbered
# differently each time this lazy plan is re-evaluated — confirm uniqueness
# or add a tie-breaker column to the ordering.
df_full = df_full.withColumn(
    "row_num",
    row_number().over(Window.partitionBy(lit(1)).orderBy(col("_line"))),
)

# Determine start row based on progress
if spark.catalog.tableExists(PROGRESS_TABLE):
    last_row_processed = (
        spark.read.table(PROGRESS_TABLE).agg({"last_row": "max"}).collect()[0][0]
    )
    if last_row_processed is None:
        # max() over a table with no rows (or only NULLs) yields None; treat
        # that as "nothing processed yet" instead of failing on the comparison.
        last_row_processed = 0

    if last_row_processed >= max_row:
        # Recorded progress already covers the whole table: begin again at row 1.
        print(f"Last processed row ({last_row_processed}) >= max rows ({max_row}), resetting start to 1")
        start = 1
    else:
        start = last_row_processed + 1
    print(f"Resuming from last processed row: {start}")
else:
    start = 1
    print("No progress table found, starting from row 1")

# Inclusive upper row number of the first chunk.
end = start + CHUNK_SIZE - 1

# Process the table in windows of CHUNK_SIZE row numbers: clean each window,
# append it to the Delta location at VOLUME_PATH, then record the last row
# number written in PROGRESS_TABLE.
while start <= max_row:
    # Clamp the final window so it never extends past the last row.
    current_end = builtins.min(end, max_row)
    print(f"Processing chunk {start} to {current_end}")
    in_chunk = col("row_num").between(start, current_end)
    chunk_df = df_full.filter(in_chunk).drop("row_num")

    # ✅ Clean & transform data
    # Text columns: remove every character outside [a-zA-Z0-9], then lowercase.
    for text_col in ("merchant", "customer", "category"):
        chunk_df = chunk_df.withColumn(
            text_col,
            trim(lower(regexp_replace(col(text_col), "[^a-zA-Z0-9]", ""))),
        )
    # gender keeps letters only.
    chunk_df = chunk_df.withColumn(
        "gender",
        trim(lower(regexp_replace(col("gender"), "[^a-zA-Z]", ""))),
    )
    # Numeric columns: keep digits (and '.' for amount), then try_cast.
    # NOTE(review): the amount pattern also removes '-', so a negative amount
    # would lose its sign — confirm this is intended.
    chunk_df = chunk_df.withColumn(
        "age", expr("try_cast(regexp_replace(age, '[^0-9]', '') AS int)")
    )
    chunk_df = chunk_df.withColumn(
        "amount", expr("try_cast(regexp_replace(amount, '[^0-9.]', '') AS double)")
    )
    chunk_df = chunk_df.withColumn("ingestion_timestamp", current_timestamp())
    # Discard rows where any of the key fields is null after cleaning.
    chunk_df = chunk_df.dropna(subset=["merchant", "customer", "amount"])

    # Add event timestamp
    chunk_df = chunk_df.withColumn("event_time", current_timestamp())

    try:
        # ✅ Write using Unity Catalog volume path
        chunk_df.write.format("delta").mode("append").save(VOLUME_PATH)
        print(f"✅ Wrote chunk {start}-{current_end} at {time.strftime('%Y-%m-%d %H:%M:%S')} to volume path")

        # Save progress to tracking table
        # The table is overwritten each time, so it holds only the latest value.
        progress_df = spark.createDataFrame([(current_end,)], ["last_row"])
        progress_df.write.mode("overwrite").saveAsTable(PROGRESS_TABLE)

    except Exception as e:
        # Either the data write or the progress write failed: stop processing.
        print(f"❌ Error writing chunk {start}-{current_end} to volume path: {e}")
        break

    # Pause between chunks, then slide the window forward by one chunk.
    time.sleep(10)
    start, end = start + CHUNK_SIZE, end + CHUNK_SIZE

print("\nDone writing chunks. Verifying written data sample:")
try:
    # Read the Delta data back from the volume path and display up to 10 rows.
    written_sample = spark.read.format("delta").load(VOLUME_PATH).limit(10)
    written_sample.show()
except Exception as read_error:
    print(f"❌ Error reading chunk output path: {read_error}")


In [0]:
# Load the staged Delta data and print summary statistics for selected columns.
staging_path = "/Volumes/testing/processed_data/staging_volume/staging_data/"
df = spark.read.format("delta").load(staging_path)

summary_columns = ["amount", "merchant", "customer", "category"]
df.select(*summary_columns).summary().show()
