# Spend Categorization

## Prompt Optimizer and Evaluation
The batch inference above works at scale, but prompt engineering remains a major challenge: we have a large, information-rich category hierarchy and no ground truth. MLflow lets us do some prompt optimization and capture reviewer guidance. This notebook bootstraps that process.

In [0]:
%pip install mlflow openai --upgrade
%restart_python

In [None]:
from src.utils import get_spark

# Obtain the session object from the project helper `src.utils.get_spark`.
# NOTE(review): presumably returns a SparkSession (it is used via `spark.sql(...)`
# later in this notebook) — confirm against the helper's implementation.
spark = get_spark()


In [0]:
# Load every row of the test table and materialise it as a pandas DataFrame
# so individual rows can be sampled locally.
df = (
    spark
    .sql("SELECT * FROM shm.spend.test")
    .toPandas()
)

In [0]:
# Name of the serving endpoint; passed as `model` to the chat-completions call.
LLM_ENDPOINT = "databricks-claude-sonnet-4"
# Workspace path of the MLflow experiment that this notebook logs to.
# NOTE(review): hard-coded to one user's folder — change it when running as
# a different user.
EXPERIMENT = "/Workspace/Users/scott.mckean@databricks.com/experiments/spend"

In [0]:
import mlflow
import openai
from databricks.sdk import WorkspaceClient

# Send MLflow tracking data to Databricks and select the experiment that
# this notebook logs to.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment(EXPERIMENT)

# Turn on MLflow autologging for the OpenAI client, so the chat-completions
# calls made later in this notebook are recorded.
mlflow.openai.autolog()

w = WorkspaceClient()

# Fetch the current user once and reuse the record; calling `me()` per field
# costs an extra API round trip for the same data.
current_user = w.current_user.me()
user_name = current_user.user_name
user_id = current_user.id

# Client used for the chat-completions calls against the serving endpoints.
openai_client = w.serving_endpoints.get_open_ai_client()

# Experiment record looked up by name; its `experiment_id` is needed to
# search traces.
experiment = w.experiments.get_by_name(EXPERIMENT).experiment

In [0]:
def build_messages(
    system_prompt: str,
    categories: dict,
    invoice_text: str,
) -> list:
    """Build the chat messages for classifying a single invoice.

    Args:
        system_prompt: Instruction text placed at the start of the system
            message.
        categories: Mapping of category name to description. Each entry is
            rendered as ``"name: description"`` on its own line, in the
            mapping's iteration order.
        invoice_text: Invoice content placed in the user message.

    Returns:
        A two-element list: the system message (prompt followed by the
        category listing) and the user message (the invoice text).
    """
    category_lines = [
        f"{name}: {description}" for name, description in categories.items()
    ]
    category_listing = "\n".join(category_lines)

    system_message = {
        "role": "system",
        "content": system_prompt + "\n\nCategories:\n" + category_listing,
    }
    user_message = {
        "role": "user",
        "content": f"Invoice:\n{invoice_text}",
    }
    return [system_message, user_message]

# Instruction placed at the start of the system message for every request.
system_prompt = (
    "You classify invoices into exactly one category "
    "and explain why."
)

# Candidate categories: name -> description shown to the model.
categories = {
    "UTILITIES": (
        "Electricity, water, gas, and other recurring utility bills."
    ),
    "SAAS": (
        "Software-as-a-service subscriptions and cloud tools."
    ),
    "OTHER": (
        "Anything that does not fit utilities or SaaS."
    ),
}

def classify_invoice(
  openai_client, model, invoice_text: str, temperature=0.1):
    """Classify one invoice with a chat-completions call.

    The prompt is built from the module-level ``system_prompt`` and
    ``categories`` via ``build_messages``.

    Args:
        openai_client: Client exposing ``chat.completions.create``.
        model: Model / serving-endpoint name passed to the API.
        invoice_text: Invoice content to classify.
        temperature: Sampling temperature forwarded to the API call
            (default 0.1).

    Returns:
        A ``(messages, content)`` tuple: the messages that were sent and the
        text content of the first choice in the response.
    """
    messages = build_messages(system_prompt, categories, invoice_text)

    # Recorded automatically (inputs, outputs, latency, etc.) when MLflow's
    # OpenAI autologging is enabled.
    resp = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        # Forward the caller's value; it was previously hard-coded to 0.1,
        # so the `temperature` argument had no effect.
        temperature=temperature,
    )
    return messages, resp.choices[0].message.content


In [0]:
# Smoke test: classify the `combined` text of one randomly sampled row.
# Left as the cell's final expression so the notebook displays the result.
classify_invoice(
    openai_client,
    model=LLM_ENDPOINT,
    invoice_text=df.sample(n=1).iloc[0].combined,
)

In [0]:
# Look up the most recent trace in the experiment (newest first, one row).
recent_traces = mlflow.search_traces(
    experiment_ids=[experiment.experiment_id],
    order_by=["timestamp DESC"],
    max_results=1,
)

# Fail with an actionable message instead of the opaque positional-indexer
# error that `.iloc[0]` raises on an empty result. IndexError is kept so the
# exception type is unchanged.
if recent_traces.empty:
    raise IndexError(
        f"No traces found in experiment {experiment.experiment_id}; "
        "run classify_invoice first and retry once the trace has been logged."
    )

mlflow_trace_id = recent_traces.iloc[0].trace_id

In [0]:
# Display the id of the trace that the assessments will be attached to.
mlflow_trace_id

In [0]:
import mlflow
from mlflow.entities import AssessmentSource, AssessmentSourceType

# Every assessment below is attributed to the same human reviewer (the
# current workspace user), so build the source once and reuse it.
human_source = AssessmentSource(
    source_type=AssessmentSourceType.HUMAN,
    source_id=user_name,
)

# Numeric correctness score (0: dismal - 10: perfect).
mlflow.log_feedback(
    trace_id=mlflow_trace_id,
    name="correctness",
    value=4,
    source=human_source,
)

# Free-form feedback. Written as a plain string rather than an indented
# triple-quoted block, so the logged value carries no stray leading or
# trailing newlines and indentation.
mlflow.log_feedback(
    trace_id=mlflow_trace_id,
    name="guidance",
    value=(
        "Category is correct the rationale is wrong, it should be other "
        "because there isn't enough data"
    ),
    source=human_source,
)

# Expected labels for this trace, one expectation per category level.
mlflow.log_expectation(
    trace_id=mlflow_trace_id,
    name="cat_lvl_1",
    value="OTHER",
    source=human_source,
)

mlflow.log_expectation(
    trace_id=mlflow_trace_id,
    name="cat_lvl_2",
    value="test",
    source=human_source,
)