# Data Generation

Creates the base Delta tables:
- **invoices_raw**: Python-generated invoice data (no descriptions)
- **invoices**: LLM-enhanced with descriptions and supplier info
- **categories**: Category hierarchy loaded from the categories CSV file referenced in the config

If you are using this accelerator on your own data, note that the **invoices** and **categories** tables drive the entire workflow. These tables can be ingested in numerous ways; this accelerator assumes that they are either generated using 0_generate or uploaded manually. They could also be populated by a synced ingestion job, for example with Lakeflow Connect to SAP.

Tested on Serverless v4.

In [0]:
%pip install uv
%uv pip install .
%restart_python

In [0]:
from src.utils import get_spark
from src.config import load_config
import random
from datetime import timedelta, datetime

# Fix the seed so every draw from the `random` module below is reproducible
# from run to run.
random.seed(42)
# Spark session and project configuration come from the project's own helpers;
# both names are used as globals by the cells that follow.
spark = get_spark()
config = load_config()

In [0]:
# Turn the configured YYYY-MM-DD strings into datetime bounds for date sampling
start_date, end_date = (
    datetime.strptime(raw_value, "%Y-%m-%d")
    for raw_value in (config.start_date, config.end_date)
)

In [0]:
def random_date():
    """Return start_date shifted by a random whole number of days.

    The offset is drawn uniformly from 0 through ``(end_date - start_date).days``,
    both ends included.
    """
    span_days = (end_date - start_date).days
    offset = random.randint(0, span_days)
    return start_date + timedelta(days=offset)

def sample_level1():
    """Draw a level-1 category name according to ``config.distribution``.

    A single uniform draw in [0, 1) is compared against the cumulative
    "Direct" and "Indirect" shares; anything beyond both falls through to
    "Non-Procureable".
    """
    draw = random.random()
    shares = config.distribution
    direct_cutoff = shares["Direct"]
    if draw < direct_cutoff:
        return "Direct"
    if draw < direct_cutoff + shares["Indirect"]:
        return "Indirect"
    return "Non-Procureable"

def sample_category_triplet():
    """Pick a (level 1, level 2, level 3) category path at random.

    Level 1 comes from ``sample_level1``; level 2 is a random key under that
    level-1 entry of ``config.categories``; level 3 is a random element of the
    sequence stored under that level-2 key.
    """
    top = sample_level1()
    subtree = config.categories[top]
    middle = random.choice(tuple(subtree.keys()))
    leaf = random.choice(subtree[middle])
    return top, middle, leaf

def sample_plant():
    """Return one entry of ``config.plants`` chosen uniformly at random."""
    candidates = config.plants
    return random.choice(candidates)

def generate_order_id(date):
    """Build an order id of the form ``ORD-<year>-<sequence>-<two letters>``.

    Args:
        date: Object exposing a ``year`` attribute; the year is embedded in the id.

    Returns:
        The id string. The sequence is a random integer between 1 and
        ``config.rows`` (inclusive), left-padded with zeros to at least five
        digits, followed by two random uppercase ASCII letters. Ids are drawn
        independently, so uniqueness is not guaranteed.
    """
    year = date.year
    sequence = random.randint(1, config.rows)
    letters = random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ', k=2)
    padded = str(sequence).zfill(5)
    suffix = ''.join(letters)
    return f"ORD-{year}-{padded}-{suffix}"

def generate_amount_and_price(l1):
    """Generate a quantity, a unit price and their product for one invoice line.

    Args:
        l1: Level-1 category name. "Non-Procureable" lines always have a
            quantity of 1 and a unit price between 1000 and 500000; every other
            value gets a quantity of 1-100 and a unit price between 5 and 5000.

    Returns:
        Tuple ``(amount, unit_price, total)`` with both monetary values rounded
        to two decimals.
    """
    single_item = l1 == "Non-Procureable"
    # The quantity is drawn before the price so the sequence of random draws
    # stays fixed for a given seed.
    amount = 1 if single_item else random.randint(1, 100)
    low, high = (1000, 500000) if single_item else (5, 5000)
    unit_price = round(random.uniform(low, high), 2)
    total = round(amount * unit_price, 2)
    return amount, unit_price, total

In [0]:
import pandas as pd

def _build_invoice_row():
    """Assemble one synthetic invoice line as a dict keyed by column name.

    The random helpers are called in a fixed order (date, category, plant,
    amount/price, then order id) so that a seeded run keeps producing the same
    sequence of rows.
    """
    invoice_date = random_date()
    level_1, level_2, level_3 = sample_category_triplet()
    site = sample_plant()
    quantity, price_each, line_total = generate_amount_and_price(level_1)
    return {
        "date": invoice_date.strftime("%Y-%m-%d"),
        "order_id": generate_order_id(invoice_date),
        "category_level_1": level_1,
        "category_level_2": level_2,
        "category_level_3": level_3,
        # The cost centre is looked up from the level-2 category.
        "cost_centre": config.category_cost_centre_mapping[level_2],
        "plant": site["name"],
        "plant_id": site["id"],
        "region": site["region"],
        "amount": quantity,
        "unit_price": price_each,
        "total": line_total,
    }


# One dict per invoice line; config.rows controls how many are generated.
rows = [_build_invoice_row() for _ in range(config.rows)]

# config.python_columns selects and orders the columns of the resulting frame.
invoice_data = pd.DataFrame(rows, columns=config.python_columns)

In [0]:
# Persist the Python-generated rows as the invoices_raw Delta table,
# replacing both the data and the schema of any earlier version.
raw_invoices_df = spark.createDataFrame(invoice_data)
raw_invoices_writer = (
    raw_invoices_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
raw_invoices_writer.saveAsTable(config.full_invoices_raw_table_path)
print(f"Created {config.full_invoices_raw_table_path}")

In [0]:
# Expose config values as notebook widgets so the %sql cell can bind them
# as named parameters (:catalog, :schema, :prompt, ...).
dbutils.widgets.removeAll()
widget_defaults = {
    "llm_endpoint": config.small_llm_endpoint,
    "catalog": config.catalog,
    "schema": config.schema_name,
    "invoices_raw": config.invoices_raw,
    "invoices": config.invoices,
    "prompt": config.prompt,
}
for widget_name, default_value in widget_defaults.items():
    dbutils.widgets.text(widget_name, default_value)

In [0]:
%sql  
-- Create invoices with LLM-generated descriptions
-- Reads every row of the invoices_raw table, asks the LLM endpoint for a
-- description, supplier and supplier_country per row, and (re)creates the
-- invoices table. Table names, the endpoint and the prompt are all bound from
-- the notebook widgets (:catalog, :schema, :invoices, :invoices_raw,
-- :llm_endpoint, :prompt).
CREATE OR REPLACE TABLE IDENTIFIER(:catalog || '.' || :schema || '.' || :invoices)
AS
SELECT
  -- NOTE(review): this * also carries the ai_result struct column produced by
  -- the subquery into the table, next to the three flattened columns below --
  -- confirm that keeping it is intended.
  *,
  ai_result.description AS description,
  ai_result.supplier AS supplier,
  ai_result.supplier_country AS supplier_country
FROM (
  SELECT
    *,
    -- The .result field of the AI_QUERY output is parsed as JSON into a struct
    -- with the three string fields requested in responseFormat.
    FROM_JSON(
      AI_QUERY(
        :llm_endpoint,
        -- Per-row prompt: the :prompt text followed by the row's level-2
        -- category, plant and region.
        CONCAT(:prompt, 'category: ', category_level_2, ', plant: ', plant, ', region: ', region),
        responseFormat => 'STRUCT<result:STRUCT<description:STRING, supplier:STRING, supplier_country:STRING>>',
        -- NOTE(review): presumably a failed call then yields NULL fields for
        -- that row instead of aborting the statement -- confirm against the
        -- AI_QUERY documentation.
        failOnError => false
      ).result,
      'STRUCT<description:STRING, supplier:STRING, supplier_country:STRING>'
    ) AS ai_result
  FROM IDENTIFIER(:catalog || '.' || :schema || '.' || :invoices_raw)
)

In [0]:
# Verify invoices table
# Pulls the first five rows of the table written by the %sql cell into a pandas
# DataFrame; as the last expression of the cell it is rendered as the cell output.
spark.table(config.full_invoices_table_path).limit(5).toPandas()

In [0]:
# Create categories table from CSV
# pandas is imported here as well so this cell can be run on its own, without
# the data-generation cells above.
import pandas as pd

# Path of the categories CSV comes from the project config.
csv_path = config.categories_file
categories_df_pd = pd.read_csv(csv_path)
categories_df = spark.createDataFrame(categories_df_pd)
(
    categories_df.write.format("delta")
    # Same as the invoices_raw write: let the new data replace the stored
    # schema too, so re-running after the CSV's columns change does not fail
    # with a Delta schema-mismatch error.
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .saveAsTable(config.full_categories_table_path)
)
print(f"Created {config.full_categories_table_path} with {len(categories_df_pd)} rows")

In [0]:
# Summary of the three tables this notebook writes
summary_lines = [
    "Tables created:",
    f"  {config.full_invoices_raw_table_path} - {config.rows} rows",
    f"  {config.full_invoices_table_path} - with LLM descriptions",
    f"  {config.full_categories_table_path} - {len(categories_df_pd)} category mappings",
]
print("\n".join(summary_lines))