# Data Generation

Creates the base Delta tables:
- **invoices_raw**: Python-generated invoice data (no descriptions)
- **invoices**: LLM-enhanced with descriptions and supplier info
- **categories**: Category hierarchy from config

In [2]:
%pip install uv
%uv pip install .
%restart_python

/Users/scott.mckean/Repos/spend_categorization/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/Users/scott.mckean/Repos/spend_categorization/.venv/bin/python: No module named uv
Note: you may need to restart the kernel to use updated packages.


UsageError: Line magic function `%restart_python` not found.


In [None]:
from src.utils import get_spark
from src.generate import load_generate_config

# Session handle from the project helper; later cells call spark.createDataFrame
# and spark.table on it (presumably a SparkSession — confirm in src.utils).
spark = get_spark()
# Generation settings object; every later cell reads its attributes
# (dates, categories, plants, table names, LLM endpoint and prompt).
config = load_generate_config()

In [3]:
import random
from datetime import timedelta, datetime
# Seed Python's global RNG: every generator below draws from `random`, so a
# fresh top-to-bottom run produces the same invoices each time.
random.seed(42)

In [None]:
# Turn the configured "YYYY-MM-DD" date bounds into datetime objects
DATE_FORMAT = "%Y-%m-%d"
start_date, end_date = (
    datetime.strptime(bound, DATE_FORMAT)
    for bound in (config.start_date, config.end_date)
)

In [None]:
def random_date():
    """Return a random datetime between start_date and end_date.

    The offset is a whole number of days drawn uniformly from
    0..(end_date - start_date).days, both ends included.
    """
    span_days = (end_date - start_date).days
    offset_days = random.randint(0, span_days)
    return start_date + timedelta(days=offset_days)

def sample_level1():
    """Draw a level-1 category using the weights in config.distribution.

    A single uniform draw in [0, 1) is compared against the cumulative
    "Direct" and "Indirect" weights; any remaining probability mass maps
    to "Non-Procureable".
    """
    draw = random.random()
    weights = config.distribution
    direct_cutoff = weights["Direct"]
    if draw < direct_cutoff:
        return "Direct"
    if draw < direct_cutoff + weights["Indirect"]:
        return "Indirect"
    # Everything above the two cutoffs falls into the last bucket.
    return "Non-Procureable"

def sample_category_triplet():
    """Pick a random (level 1, level 2, level 3) category path.

    Level 1 comes from sample_level1(); level 2 and level 3 are then chosen
    uniformly from the matching entries of config.categories.
    """
    level1 = sample_level1()
    level2_options = config.categories[level1]
    level2 = random.choice(list(level2_options.keys()))
    level3 = random.choice(level2_options[level2])
    return level1, level2, level3

def sample_plant():
    """Return one entry of config.plants, chosen uniformly at random."""
    candidate_plants = config.plants
    return random.choice(candidate_plants)

def generate_order_id(date):
    """Build an order id of the form ORD-<year>-<5-digit seq>-<2 letters>.

    The sequence number is drawn from 1..config.rows and zero-padded to five
    digits; the suffix is two random uppercase letters. Ids are random, so
    nothing here guarantees they are unique.
    """
    sequence = random.randint(1, config.rows)
    letters = random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ', k=2)
    return "ORD-{}-{:05d}-{}".format(date.year, sequence, "".join(letters))

def generate_amount_and_price(l1):
    """Return (amount, unit_price, total) for an invoice line.

    "Non-Procureable" lines always have amount 1 and a unit price drawn from
    1000..500000; every other level-1 category gets an amount in 1..100 and
    a unit price drawn from 5..5000. Prices and totals are rounded to 2
    decimals.
    """
    single_item = l1 == "Non-Procureable"
    # Draw the amount before the price so the RNG sequence stays unchanged.
    quantity = 1 if single_item else random.randint(1, 100)
    price_low, price_high = (1000, 500000) if single_item else (5, 5000)
    price = round(random.uniform(price_low, price_high), 2)
    return quantity, price, round(quantity * price, 2)

In [None]:
import pandas as pd


def build_invoice_row():
    """Assemble one synthetic invoice record as a dict keyed by column name.

    The random draws happen in a fixed order (date, category path, plant,
    amount/price, order id) so the seeded output is reproducible.
    """
    invoice_date = random_date()
    level1, level2, level3 = sample_category_triplet()
    site = sample_plant()
    quantity, price, line_total = generate_amount_and_price(level1)
    order_id = generate_order_id(invoice_date)

    return {
        "date": invoice_date.strftime("%Y-%m-%d"),
        "order_id": order_id,
        "category_level_1": level1,
        "category_level_2": level2,
        "category_level_3": level3,
        "cost_centre": config.category_cost_centre_mapping[level2],
        "plant": site["name"],
        "plant_id": site["id"],
        "region": site["region"],
        "amount": quantity,
        "unit_price": price,
        "total": line_total,
    }


# One record per configured row, then a single DataFrame construction.
rows = [build_invoice_row() for _ in range(config.rows)]

invoice_data = pd.DataFrame(rows, columns=config.python_columns)

In [9]:
# Display the generated invoices for a quick visual check before saving them
invoice_data

Unnamed: 0,date,order_id,category_level_1,category_level_2,category_level_3,cost_centre,plant,plant_id,region,amount,unit_price,total
0,2025-10-16,ORD-2025-02287-TR,Direct,Blades & Hub Parts,Blade shear web,CC-300-Engineering,US-West Plant,PLANT-US-W,North America,70,439.26,30748.20
1,2025-03-08,ORD-2025-08280-PO,Direct,Components,Control PCB,CC-100-Production,US-West Plant,PLANT-US-W,North America,92,3251.17,299107.64
2,2025-07-12,ORD-2025-00107-TE,Direct,Bearings & Seals,Oil seal,CC-100-Production,Germany-North,PLANT-DE-N,Europe,55,1704.55,93750.25
3,2024-06-08,ORD-2024-06225-CW,Direct,Electrical Assemblies,Control cabinet,CC-100-Production,US-East Plant,PLANT-US-E,North America,78,1326.28,103449.84
4,2024-02-14,ORD-2024-01292-OV,Indirect,Events & Conferences,Conference registration,CC-500-Sales,Germany-South,PLANT-DE-S,Europe,80,4427.83,354226.40
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2024-10-01,ORD-2024-07585-LY,Direct,Blades & Hub Parts,Pitch bearing,CC-300-Engineering,Brazil Plant,PLANT-BR,South America,69,663.55,45784.95
9996,2024-12-17,ORD-2024-01192-HL,Direct,Packaging Materials,Export crate,CC-700-Logistics,US-West Plant,PLANT-US-W,North America,62,2379.93,147555.66
9997,2024-03-18,ORD-2024-03705-QJ,Indirect,Safety & PPE,Hearing protection,CC-200-Maintenance,Vietnam Plant,PLANT-VN,Asia,83,2206.96,183177.68
9998,2025-07-14,ORD-2025-05719-AH,Indirect,Temporary Labor / Contracting,Crane rental crew,CC-200-Maintenance,Germany-South,PLANT-DE-S,Europe,49,2481.28,121582.72


In [None]:
# Persist the generated invoices as the invoices_raw Delta table,
# replacing both the data and the schema of any existing table.
invoices_raw_sdf = spark.createDataFrame(invoice_data)
raw_writer = (
    invoices_raw_sdf.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
raw_writer.saveAsTable(config.full_invoices_raw)
print(f"Created {config.full_invoices_raw}")

In [0]:
# Widgets for SQL cells: each widget name is paired with its default from config
widget_defaults = {
    "llm_endpoint": config.llm_endpoint,
    "catalog": config.catalog,
    "schema": config.schema_name,
    "invoices_raw": config.invoices_raw_table,
    "invoices": config.invoices_table,
    "prompt": config.prompt,
}
for widget_name, default_value in widget_defaults.items():
    dbutils.widgets.text(widget_name, default_value)

In [0]:
%sql
-- Create invoices with LLM-generated descriptions.
-- AI_QUERY is called exactly once per row (in the inner query) and the three
-- fields are extracted from that single JSON response. Calling it once per
-- output column would triple the endpoint cost and, because every call is an
-- independent LLM generation, could pair a description with a supplier and
-- country that came from different answers.
CREATE OR REPLACE TABLE $catalog.$schema.$invoices AS
SELECT
  * EXCEPT (llm_response),
  llm_response:description AS description,
  llm_response:supplier AS supplier,
  llm_response:supplier_country AS supplier_country
FROM (
  SELECT
    t.*,
    AI_QUERY(
      :llm_endpoint,
      CONCAT(:prompt, 'category: ', category_level_2, ', plant: ', plant, ', region: ', region),
      responseFormat => '{"type":"json_schema","json_schema":{"name":"inv","schema":{"type":"object","properties":{"description":{"type":"string"},"supplier":{"type":"string"},"supplier_country":{"type":"string"}}}}}'
    ) AS llm_response
  FROM $catalog.$schema.$invoices_raw t
)

In [None]:
# Preview the first five rows of the invoices table as a pandas DataFrame
invoices_preview = spark.table(config.full_invoices).limit(5)
invoices_preview.toPandas()

Unnamed: 0,date,order_id,input,output
0,2024-04-14,ORD-2024-06913-AC,Generate these fields based on the provided in...,"{""description"":""Annual sales conference regist..."


In [0]:
# Flatten the nested config.categories mapping into one row per
# (level 1, level 2, level 3) path and save it as the categories table.
# Level-2 categories without a cost-centre mapping get an empty string.
categories_rows = [
    {
        "category_level_1": level1,
        "category_level_2": level2,
        "category_level_3": level3,
        "cost_centre": config.category_cost_centre_mapping.get(level2, ""),
    }
    for level1, level2_map in config.categories.items()
    for level2, level3_names in level2_map.items()
    for level3 in level3_names
]

categories_df = spark.createDataFrame(categories_rows)
(
    categories_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(config.full_categories)
)
print(f"Created {config.full_categories} with {len(categories_rows)} rows")

In [0]:
# Summary of the tables this notebook wrote
summary_lines = [
    "Tables created:",
    f"  {config.full_invoices_raw} - {config.rows} rows",
    f"  {config.full_invoices} - with LLM descriptions",
    f"  {config.full_categories} - {len(categories_rows)} category mappings",
]
print("\n".join(summary_lines))