In [0]:
%pip install openpyxl
%restart_python

In [0]:
from pathlib import Path
from pyspark.sql import functions as F
import pandas as pd

In [0]:
# --- Notebook configuration ---
# Name of the category workbook to load from the categories folder.
category_file = 'SMS Categories - Dec 12 - 2025.xlsx'

# Write switch: when True, the guarded cells further down append to Delta tables.
update_tables = True

# Root of the volume for this project and the folder holding category workbooks.
volume_root = '/Volumes/sandbox/sms_categorization/'
cat_path = Path(volume_root, 'categories')

# Show which workbooks are available so `category_file` can be checked by eye.
list(cat_path.glob('*.xlsx'))

In [0]:
# Load the category workbook selected in the config cell.
categories_raw = pd.read_excel(cat_path / category_file, engine='openpyxl')

# Build one "Category: Description" entry per category.
# - Rows without a Category are dropped: there is nothing to classify into,
#   and they would otherwise surface as the literal text "nan" in the prompt.
# - A missing Description becomes an empty string for the same reason.
# - Values are cast to str so non-text cells (e.g. numbers) concatenate safely.
# `.assign` is used so the loaded frame is not mutated in place.
df = (
    categories_raw
    .dropna(subset=['Category'])
    .assign(
        Category_Description=lambda frame: (
            frame['Category'].astype(str)
            + ': '
            + frame['Description'].fillna('').astype(str)
        )
    )
)

# Single string holding every entry, separated by one space; this is the
# category context stored with, and later appended to, the prompt.
categories_str = ' '.join(df['Category_Description'])
display(categories_str)

In [0]:
# Append a timestamped snapshot of the category string (and the workbook it
# came from) to the categories table. Skipped when `update_tables` is False.
if update_tables:
  categories_snapshot = (
    spark.createDataFrame([(categories_str,)], ["categories_str"])
    .select(
      F.current_timestamp().alias("date"),
      F.lit(category_file).alias("file_name"),
      "categories_str",
    )
  )
  (
    categories_snapshot.write
    .format("delta")
    .mode("append")
    .saveAsTable("sandbox.sms_categorization.categories")
  )

In [0]:
%sql
-- Inspect every stored category snapshot.
SELECT *
FROM sandbox.sms_categorization.categories

In [0]:
# Instruction text sent to the model ahead of the category list and the invoice.
# It defines the 1-5 confidence scale and asks for a JSON answer with the
# category, confidence and rationale. The text is stored verbatim in the
# prompts table, so any edit here creates a new prompt version.
prompt = """Use the following categories and accompanying description and classify the invoice below.

<Rules>
- If you cannot identify the category, return "Unknown / BadData" and rank the confidence as 1
- A confidence of 2 means you can classify it, but it could easily be multiple categories
- A confidence of 3 means it is likely a single category, but the rationale isn't clear
- A confidence of 4 means you are very confident in your classification with a clear rationale
- A confidence of 5 means you have an exact example in the context for the classification
</Rules>

Return a json output with the category, confidence of classification between 1 and 5, and rationale. You must use the categories provided.
"""

In [0]:
# Append the current prompt text with a timestamp to the prompts table, so the
# newest version can be selected by date later on.
# Gated by the `update_tables` flag from the config cell, the same way the
# categories write is; the flag is not re-assigned here so that setting it to
# False in the config cell disables every table write.
if update_tables:
  (
    spark.createDataFrame([(prompt,)], ["prompt"])
    .withColumn("date", F.current_timestamp())
    .select('date','prompt')
    .write.format('delta')
    .mode('append')
    .saveAsTable('sandbox.sms_categorization.prompts')
  )

In [0]:
%sql
-- Inspect every stored prompt version.
SELECT *
FROM sandbox.sms_categorization.prompts

In [0]:
%sql
-- Rebuild test_output: classify every row of the test table with AI_QUERY,
-- using the most recently stored prompt and category snapshot.
CREATE OR REPLACE TABLE sandbox.sms_categorization.test_output AS
WITH latest_prompt AS (
  -- Newest prompt version by timestamp.
  SELECT prompt, date
  FROM sandbox.sms_categorization.prompts
  ORDER BY date DESC
  LIMIT 1
),
latest_category AS (
  -- Newest category snapshot by timestamp.
  SELECT *
  FROM sandbox.sms_categorization.categories
  ORDER BY date DESC
  LIMIT 1
),
model_inputs AS (
  -- Assemble the full model input once per invoice row, so the stored `input`
  -- column is exactly the text passed to the model.
  -- The single-row CTEs are attached to every test row via an always-true
  -- LEFT JOIN, which keeps the test rows even if a CTE returns nothing.
  -- NOTE(review): CONCAT returns NULL when any argument is NULL, so a row with
  -- a NULL Vendor or LineDescription gets a NULL input -- confirm this is intended.
  SELECT
    t.FactInvoiceCodingID,
    t.Vendor,
    t.LineDescription,
    t.ProjectCategoryDesc,
    CONCAT(
      lp.prompt, '\n',
      lc.categories_str, '\n',
      "##invoice \n",
      t.vendor, '\n',
      t.LineDescription, '\n'
    ) AS input
  FROM sandbox.sms_categorization.test t
  LEFT JOIN latest_prompt lp ON 1=1
  LEFT JOIN latest_category lc ON 1=1
)
SELECT
  mi.FactInvoiceCodingID,
  current_date() AS inference_date,
  mi.Vendor,
  mi.LineDescription,
  mi.ProjectCategoryDesc,
  mi.input,
  -- Structured response: a JSON object with category, confidence and rationale.
  AI_QUERY(
    'databricks-claude-3-7-sonnet',
    mi.input,
    responseFormat => '{
      "type": "json_schema",
      "json_schema": {
        "name": "categorization",
        "schema": {
          "type": "object",
          "properties": {
            "category": {"type": "string"},
            "confidence": {"type": "number"},
            "rationale": {"type": "string"}
          }
        }
      }
    }'
  ) AS output
FROM model_inputs mi

In [0]:
%sql
-- Result set intended for download: every test_output column, the three fields
-- pulled out of the model's JSON answer, and three empty text columns
-- (correct, expectation, guidance) -- presumably filled in during manual review.
SELECT
  o.*,
  get_json_object(o.output, '$.category')   AS category,
  get_json_object(o.output, '$.confidence') AS confidence,
  get_json_object(o.output, '$.rationale')  AS rationale,
  '' AS correct,
  '' AS expectation,
  '' AS guidance
FROM sandbox.sms_categorization.test_output AS o

In [0]:
%sql
-- Copy the current contents of test_output into the archive table.
-- This is a plain append: running the cell again inserts the same rows again.
INSERT INTO sandbox.sms_categorization.test_outputs_archive
SELECT *
FROM sandbox.sms_categorization.test_output

In [0]:
%sql
-- Inspect everything archived so far.
SELECT *
FROM sandbox.sms_categorization.test_outputs_archive