# In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# ---------- 1. Get pr_id parameter ----------
# Register the "pr_id" text widget (default "local_dev") and read its value.
dbutils.widgets.text("pr_id", "local_dev")  # default for manual testing
# Strip surrounding whitespace so a stray space/newline passed by the caller
# does not end up inside the database name.
pr_id = dbutils.widgets.get("pr_id").strip()

# Explicit raise instead of `assert`: assert statements are removed when
# Python runs with -O, which would silently disable this check.
if not pr_id:
    raise ValueError("pr_id is required")

# pr_id is interpolated directly into SQL identifiers below (database name and,
# later, the table name). Accept only ASCII letters, digits and underscores so
# that values such as "pr-123" or "x; DROP ..." fail here with a clear message
# instead of producing a parse error or unintended SQL.
if not (pr_id.isascii() and all(ch.isalnum() or ch == "_" for ch in pr_id)):
    raise ValueError(
        f"pr_id may only contain ASCII letters, digits and underscores, got {pr_id!r}"
    )

print(f"Initializing test data for pr_id = '{pr_id}'")

# ---------- 2. Build raw DB name ----------
# "prod" maps to the un-prefixed database; every other pr_id gets its own
# "<pr_id>_raw" database.
# NOTE(review): with pr_id == "prod" the write step further down overwrites
# raw.orders_raw with the inline test rows — confirm this is intended.
if pr_id == "prod":
    raw_db_name = "raw"
else:
    raw_db_name = f"{pr_id}_raw"

print(f"Using raw DB = {raw_db_name}")

# Make sure the database (schema) exists
spark.sql(f"CREATE DATABASE IF NOT EXISTS {raw_db_name}")

# ---------- 3. Create a small test dataset (inline) ----------
# Column order shared by the row tuples and the schema definition below.
_columns = ("order_id", "customer_id", "amount", "created_at")

# One tuple per order; timestamps are kept as ISO-8601 strings at this stage.
_rows = [
    ("o1", "c1", 100.0, "2024-01-01T10:00:00"),
    ("o2", "c2", 200.5, "2024-02-15T15:30:00"),
    # Negative amount: per the original note this row "will be filtered out"
    # (the filter itself is not part of this section).
    ("o3", "c3", -50.0, "2024-03-10T09:00:00"),
]

# Same list-of-dicts shape as before, assembled from the compact tuples above.
data = [dict(zip(_columns, row)) for row in _rows]

# Every column is declared non-nullable; created_at is loaded as a string first
# and converted to a timestamp right after the DataFrame is created.
_column_types = (
    T.StringType(),
    T.StringType(),
    T.DoubleType(),
    T.StringType(),
)
schema = T.StructType(
    [
        T.StructField(col_name, col_type, False)
        for col_name, col_type in zip(_columns, _column_types)
    ]
)

# Build the DataFrame with string timestamps, then replace created_at with
# its parsed timestamp value.
_df_strings = spark.createDataFrame(data, schema)
df_raw = _df_strings.withColumn("created_at", F.to_timestamp("created_at"))


print("Test raw data:")
display(df_raw)

# ---------- 4. Write to raw table ----------
# Fully qualified target table: <raw database>.orders_raw
raw_table = f"{raw_db_name}.orders_raw"
print(f"Writing test data to raw table: {raw_table}")

# Save df_raw as a Delta table. mode("overwrite") replaces any existing
# contents of the target table, so re-running this cell resets the test data.
(
    df_raw
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(raw_table)
)

# The trailing check mark was previously stored as mojibake ("âœ…", i.e. the
# UTF-8 bytes of the emoji decoded as cp1252); restored to the intended character.
print("Raw test data initialized ✅")
