In [None]:
import asyncio
import logging
from typing import List, Dict, Any

import mlflow
from databricks.vector_search.client import VectorSearchClient
from databricks_langchain import ChatDatabricks
from langchain_core.language_models.llms import create_base_retry_decorator
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from mlflow.entities import SpanType, Document

# Assuming ChatDatabricks, llm_retry_strategy, mlflow, SpanType are defined elsewhere or imported as needed
logger = logging.getLogger(__name__)


In [None]:
# Sampling settings passed to every ChatDatabricks client created in this notebook.
# NOTE(review): 1.5 is a high temperature for a node that must emit strict JSON —
# confirm this is intentional rather than a leftover from experimentation.
TEMPERATURE: float = 1.5
# Upper bound on generated tokens per completion.
MAX_TOKENS: int = 2048

In [6]:
# Exception classes that trigger a retry; `Exception` matches every ordinary error,
# so any failure escaping a decorated callable is retried.
retry_error_types = (Exception,)

# Retry decorator shared by the LLM-facing methods in this notebook.
# NOTE(review): langchain_core forwards `max_retries` to tenacity's stop condition, so 5
# presumably caps the total number of attempts rather than the number of extra retries —
# confirm against the installed langchain_core version.
llm_retry_strategy = create_base_retry_decorator(
    max_retries=5,
    error_types=retry_error_types,
)

In [43]:
class LLMClient:
    """Bundle of the Databricks chat models and the vector search index used for routing.

    Attributes:
        dbr_llm: ChatDatabricks client for the Llama 3.3 70B instruct endpoint.
        dbr_llm_mini: ChatDatabricks client for the Llama 3.1 8B instruct endpoint.
        vsc: VectorSearchClient used to resolve the index.
        vs_index: Index handle queried by ``evaluate_context_sufficiency``.
    """

    def __init__(
        self,
        vs_endpoint_name: str = "one-env-shared-endpoint-3",
        vs_index_name: str = "shm.multimodal.index",
    ):
        """
        Create the chat model clients and resolve the vector search index.

        Args:
            vs_endpoint_name: Vector search endpoint that hosts the index.
            vs_index_name: Fully qualified name of the vector search index.
        """
        # Both models share the module-level sampling settings.
        self.dbr_llm = ChatDatabricks(
            endpoint="databricks-meta-llama-3-3-70b-instruct",
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )

        self.dbr_llm_mini = ChatDatabricks(
            endpoint="databricks-meta-llama-3-1-8b-instruct",
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )

        self.vsc = VectorSearchClient()
        self.vs_index = self.vsc.get_index(
            endpoint_name=vs_endpoint_name,
            index_name=vs_index_name,
        )

        logger.info("LLMClient manager initialized")


    @llm_retry_strategy
    @mlflow.trace(span_type=SpanType.CHAT_MODEL, name="evaluate_context_sufficiency_llm")
    async def evaluate_context_sufficiency(
        self, 
        user_message: str,
        chat_history: List[Dict[str, str]]
    ) -> Dict[str, Any]:
        """
        Evaluate if existing context is sufficient to answer user's question.

        The method runs a similarity search for ``user_message`` against
        ``self.vs_index``, formats the hits together with the chat history into a
        routing prompt, and asks ``self.dbr_llm`` for a JSON verdict.

        Args:
            user_message: The user's latest question or comment
            chat_history: List of previous conversation messages; each message
                must provide the keys 'role' and 'content'.

        Returns:
            Dictionary with 'need_retrieval' boolean and 'reasoning'. If the model
            output is not a dict containing 'need_retrieval', or the LLM call
            raises, a fallback with ``need_retrieval=True`` is returned instead.

        Raises:
            Exception: Errors raised before the LLM call (the vector search or the
                prompt input construction) are not caught here. They propagate to
                the retry decorator, which is the only situation in which this
                method is retried, because LLM errors are converted into the
                fallback result below.
        """
        logger.info("Evaluating context sufficiency for follow-up conversation")

        # similarity_search is a synchronous call; running it in a worker thread keeps
        # the event loop free to serve other requests while the search is in flight.
        initial_retrieved_context = await asyncio.to_thread(
            self.vs_index.similarity_search,
            query_text=user_message,
            columns=['enriched_text', 'headings'],
        )

        # Tolerate responses without 'result' / 'data_array' instead of raising
        # KeyError. NOTE(review): presumably the service omits 'data_array' when
        # there are no hits — confirm against the vector search response schema.
        search_result = (initial_retrieved_context or {}).get('result') or {}
        retrieved_rows = search_result.get('data_array') or []

        # Row layout follows the requested columns: index 0 is 'enriched_text',
        # index 1 is 'headings'.
        context_text = "\n\n".join(
            f"Headings: {doc[1]}, Content: {doc[0]}"
            for doc in retrieved_rows
        )

        prompt = PromptTemplate.from_template(
            """You are a specialized routing node. Your purpose is to determine if a `User's Current Question` requires fetching new documents, given the `Chat History` and any `Previously Retrieved Documents`.

            ### Decision Logic:
            Set `need_retrieval` to `true` if:
            * The `User's Current Question` asks for new facts, details, or topics not present in the `Chat History` or `Previously Retrieved Documents`.

            Set `need_retrieval` to `false` if:
            * The `User's Current Question` is a rephrasing or clarification that can be answered using only the information already in the `Chat History` or `Previously Retrieved Documents`.

            <BEGIN CONTEXT>
            Chat History:
            {chat_history}

            User's Current Question:
            {user_question}

            Previously Retrieved Documents:
            {retrieved_documents}
            <END CONTEXT>

            **Provide your assessment in JSON format.**
            ```json
            {{
            "need_retrieval": true, // boolean: true if additional documents are likely needed, false otherwise
            "reasoning": "A concise explanation for the decision (e.g., 'Follow-up asks for new details on X not in original summary.', or 'Question is a rephrasing of previous info.')"
            }}
            ```"""
        )

        base_chain = prompt | self.dbr_llm | JsonOutputParser()

        input_dict = {
            "chat_history": "\n".join(f"{msg['role']}: {msg['content']}" for msg in chat_history),
            "user_question": user_message,
            "retrieved_documents": context_text
        }

        try:
            result = await base_chain.ainvoke(input_dict)
            if not isinstance(result, dict) or 'need_retrieval' not in result:
                logger.warning(f"Invalid result structure during evaluate_context_sufficiency: {result}")
                result = {"need_retrieval": True, "reasoning": "Could not reliably determine context sufficiency."}
        except Exception as e:
            # Deliberate best-effort: fall back to "retrieve" rather than failing the
            # request. Because the error is swallowed here, the retry decorator never
            # sees LLM failures.
            logger.error(f"Unexpected error in evaluate_context_sufficiency: {str(e)}", exc_info=True)
            result = {"need_retrieval": True, "reasoning": f"Error during evaluation: {e}"}
        return result

In [49]:
client = LLMClient()

user_message = "What is the capital of France?"
chat_history = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "The capital of France is Nice"}]

result = await client.evaluate_context_sufficiency(user_message, chat_history)

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.


In [51]:
# Repeat the same evaluation with the already-constructed client and the same inputs.
result = await client.evaluate_context_sufficiency(user_message, chat_history)

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.


Now we take the above code and wrap it in a FastAPI app, deployed on Databricks Apps. We can then call the backend with the `requests` package. Below, we run the app locally (via `uv run uvicorn small_examples.async_llm_app:app`) and call the `/evaluate` endpoint every 30 seconds.

In [None]:
import datetime
import time

import requests

BASE_URL = "http://127.0.0.1:8000"  # replace with your Databricks App URL if deployed
N_CALLS = 10             # number of /evaluate requests to send
INTERVAL_SECONDS = 30    # pause between consecutive requests
HEALTH_TIMEOUT = 10      # seconds to wait for /healthz before giving up
EVALUATE_TIMEOUT = 60    # seconds to wait for /evaluate before giving up

# Health (with a timeout so an unreachable app cannot hang the cell forever)
print(requests.get(f"{BASE_URL}/healthz", timeout=HEALTH_TIMEOUT).json())

# Evaluate
payload = {
    "user_message": "What is the capital of France?",
    "chat_history": [
        {"role": "user", "content": "Hi"},
        {"role": "assistant", "content": "The capital of France is Nice"},
    ],
}

for call_idx in range(N_CALLS):
    try:
        resp = requests.post(f"{BASE_URL}/evaluate", json=payload, timeout=EVALUATE_TIMEOUT)
    except requests.RequestException as exc:
        # Keep polling: one timeout or connection error should not end the experiment.
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        print(f"request failed: {exc}")
    else:
        # Error responses are not guaranteed to carry JSON; fall back to the raw text.
        try:
            body = resp.json()
        except ValueError:
            body = resp.text
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        print(resp.status_code, body)

    # No need to wait after the final request.
    if call_idx < N_CALLS - 1:
        time.sleep(INTERVAL_SECONDS)

{'status': 'ok'}
2025-08-12 16:33:00
200 {'need_retrieval': False, 'reasoning': "The User's Current Question is a rephrasing of a question that can be answered using the information already in the Chat History. The chat history contains the answer to the question 'What is the capital of France?' which is 'Nice', although it's worth noting that this answer is incorrect as the capital of France is actually Paris. However, based on the decision logic provided, since the question can be answered using the chat history, no new retrieval is needed."}
2025-08-12 16:33:32
200 {'need_retrieval': False, 'reasoning': "The User's Current Question is a rephrasing of a question that can be answered using the information already in the Chat History. The chat history contains the answer to the question 'What is the capital of France?' which is 'Nice', although it's worth noting that this answer is incorrect as the capital of France is actually Paris, not Nice. However, based on the provided context, t