In [0]:
import json
import re

from pyspark.sql import functions as F
from pyspark.sql import types as T

# ---------- 1. Get pr_id parameter ----------
# Register the "pr_id" text widget (default "local_dev" for manual testing) and
# read its current value. Surrounding whitespace is dropped so that a
# whitespace-only value counts as missing.
dbutils.widgets.text("pr_id", "local_dev")  # default for manual testing
pr_id = dbutils.widgets.get("pr_id").strip()

# Explicit check instead of `assert`: assertions are removed when Python runs
# with -O, which would let an empty pr_id through unnoticed.
if not pr_id:
    raise ValueError("pr_id is required")

print(f"Initializing test data for pr_id = '{pr_id}'")

# ---------- 2. Build raw DB name ----------
# "prod" maps to the plain "raw" schema; any other pr_id gets "<pr_id>_raw".
# NOTE(review): with pr_id == "prod" the write step further down overwrites
# raw.orders_raw with the test fixture — confirm that this is intended.
if pr_id == "prod":
    raw_db_name = "raw"
else:
    raw_db_name = f"{pr_id}_raw"

# The name is interpolated unquoted into the SQL statement below and into the
# table name used for the write, so only letters, digits and underscores are
# accepted. Anything else is rejected up front with a clear error.
if not re.fullmatch(r"[A-Za-z0-9_]+", raw_db_name):
    raise ValueError(
        f"Invalid raw DB name {raw_db_name!r}: pr_id may only contain "
        "letters, digits and underscores"
    )

print(f"Using raw DB = {raw_db_name}")

# Make sure the database (schema) exists
spark.sql(f"CREATE DATABASE IF NOT EXISTS {raw_db_name}")

# ---------- 3. Load test dataset from JSON file in the repo ----------

# Absolute workspace path of the JSON fixture.
# Example of a Repos workspace path, for orientation:
#   /Workspace/Repos/you/de-lab-databricks/notebooks/tests/init_test_notebook
input_path = "/Workspace/Repos/radomir@elfak.rs/de-lab-databricks/tests/input/orders_input.json"
print(f"Reading test data from: {input_path}")

# Pass the encoding explicitly: without it open() falls back to the
# locale-dependent default, which can mis-decode non-ASCII characters.
# The handle is not called `f`, to avoid confusion with the `F` functions alias.
with open(input_path, "r", encoding="utf-8") as fixture_file:
    data = json.load(fixture_file)

print("Raw Python data loaded from JSON:")
print(data)

# Schema for the raw orders rows. Every column is declared non-nullable, and
# created_at is ingested as a plain string so it can be parsed afterwards.
schema = (
    T.StructType()
    .add("order_id", T.StringType(), False)
    .add("customer_id", T.StringType(), False)
    .add("amount", T.DoubleType(), False)
    .add("created_at", T.StringType(), False)
)

# Build the DataFrame from the loaded rows using the schema above ...
df_raw = spark.createDataFrame(data, schema)
# ... then replace the string column with its timestamp form
# (no explicit format is passed to to_timestamp).
df_raw = df_raw.withColumn("created_at", F.to_timestamp(F.col("created_at")))

print("Test raw DataFrame:")
display(df_raw)

# ---------- 4. Write to raw table ----------
# Target table: "orders_raw" inside the raw database chosen above.
raw_table = f"{raw_db_name}.orders_raw"
print(f"Writing test data to raw table: {raw_table}")

# Save the DataFrame as a Delta table. Mode "overwrite" replaces whatever the
# table held before, so re-running the notebook resets the test data.
(
    df_raw
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(raw_table)
)

# The status message previously ended in mojibake ("âœ…"), which is the UTF-8
# check mark decoded with the wrong code page; the intended character is restored.
print("Raw test data initialized ✅")
