Create Vector Search Index from the processed/Chunked Data stored in Delta table

In [0]:
# %pip install -U -qqqq mlflow>=3.1.4 langchain==0.3.27 langgraph==0.6.11 databricks-langchain pydantic databricks-agents unitycatalog-langchain[databricks] databricks-feature-engineering==0.12.1 protobuf<5  cryptography<43 databricks-mcp
# dbutils.library.restartPython()

%pip install databricks-vectorsearch
dbutils.library.restartPython()




[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


Create Vector Index in existing endpoint

In [0]:
VECTOR_SEARCH_ENDPOINT_NAME = "databricks_doc_index"
catalog = "workspace"
dbName = "default"


#The table we'd like to index
source_table_fullname = f"{catalog}.{dbName}.databricks_docs_processed_chunks"
# Where we want to store our index
vs_index_fullname = f"{catalog}.{dbName}.databricks_index"

In [0]:
from databricks.vector_search.client import VectorSearchClient


vsc = VectorSearchClient(disable_notice=True)
print(f"Creating index {vs_index_fullname} on endpoint {VECTOR_SEARCH_ENDPOINT_NAME}...")
vsc.create_delta_sync_index(
  endpoint_name=VECTOR_SEARCH_ENDPOINT_NAME,
  index_name=vs_index_fullname,
  source_table_name=source_table_fullname,
  pipeline_type="TRIGGERED",
  primary_key="chunk_id",
  embedding_source_column='text', #The column containing our text
  embedding_model_endpoint_name='databricks-gte-large-en' #The embedding endpoint used to create the embeddings
)


print(f"index {vs_index_fullname} on table {source_table_fullname} is ready")

Creating index workspace.default.databricks_index on endpoint databricks_doc_index...
index workspace.default.databricks_index on table workspace.default.databricks_docs_processed_chunks is ready


Retrieval tool with filters

In [0]:
from databricks_langchain import VectorSearchRetrieverTool, ChatDatabricks


# Retriever for API documentation
generic_retriever = VectorSearchRetrieverTool(
    endpoint_name=VECTOR_SEARCH_ENDPOINT_NAME,
    index_name=vs_index_fullname,
    columns=["chunk_id", "doc_id", "text", "url"],
    tool_name="generic_doc_retriever",
    tool_description="Retrieves generic documentation for generic queries.",
    filters={"doc_type": "general"},
    num_results=5,
    disable_notice=True
)

# Retriever for API documentation
api_retriever = VectorSearchRetrieverTool(
    endpoint_name=VECTOR_SEARCH_ENDPOINT_NAME,
    index_name=vs_index_fullname,
    columns=["chunk_id", "doc_id", "text", "url", "doc_type"],
    tool_name="api_docs_retriever",
    tool_description="Retrieves API reference documentation.",
    filters={"doc_type": "api_reference"},
    num_results=5,
    disable_notice=True
)

# Retriever for tutorials
tutorial_retriever = VectorSearchRetrieverTool(
    endpoint_name=VECTOR_SEARCH_ENDPOINT_NAME,
    index_name=vs_index_fullname,
    columns=["chunk_id", "doc_id", "text", "url", "doc_type"],
    tool_name="tutorial_retriever",
    tool_description="Retrieves tutorial and how-to guides.",
    filters={"doc_type": "tutorial"},
    num_results=5,
    disable_notice=True
)

# Retriever for code examples
code_retriever = VectorSearchRetrieverTool(
    endpoint_name=VECTOR_SEARCH_ENDPOINT_NAME,
    index_name=vs_index_fullname,
    columns=["chunk_id", "doc_id", "text", "url", "doc_type", "has_code"],
    tool_name="code_examples_retriever",
    tool_description="Retrieves documentation with code examples.",
    filters={"has_code": "true"},
    num_results=5,
    disable_notice=True
)

# Bind all tools to LLM
llm = ChatDatabricks(endpoint="databricks-qwen3-next-80b-a3b-instruct")
llm_with_tools = llm.bind_tools([generic_retriever, api_retriever, tutorial_retriever, code_retriever])

Test Tool Calling with React Agent

In [0]:
results = tutorial_retriever.invoke("how to use mlflow with databricks jobs")


Trace(trace_id=tr-aa7cef4bcf11bab1d763aa4675fb848c)

In [0]:
from langgraph.prebuilt import create_react_agent

# Create agent that automatically executes tools
agent = create_react_agent(
    llm,
    [generic_retriever, api_retriever, tutorial_retriever, code_retriever]
)

# Invoke - tools are automatically executed
result = agent.invoke({
    "messages": [{"role": "user", "content": "how to use mlflow with databricks jobs"}]
})

# Get the final answer
print(result["messages"][-1].content)

To use MLflow with Databricks jobs, you can integrate MLflow for experiment tracking and model management within scheduled Databricks workflows. Here's how you can do it:

### Key Concepts:
1. **Databricks Jobs**: These are non-interactive mechanisms to run notebooks or libraries on a scheduled or on-demand basis.
2. **MLflow Experiments**: Used to track and organize machine learning training runs within Databricks.

### Steps to Use MLflow with Databricks Jobs:

1. **Set Up MLflow in Your Notebook**:
   Ensure MLflow is properly configured in the notebook that will be part of your Databricks job. Include the necessary imports and tracking setup:
   ```python
   import mlflow
   import mlflow.spark

   # Set the tracking URI to use Databricks' managed MLflow
   mlflow.set_tracking_uri("databricks")

   # Start an MLflow experiment
   mlflow.set_experiment("/path/to/your/experiment")
   ```

2. **Log Metrics and Parameters**:
   Within your training or processing code, log parameters, m

Trace(trace_id=tr-ea8e4378c59114defa0855be0a8b813c)

In [0]:
def trace_agent_execution(query: str):
    """Execute agent and trace all tool calls."""
    
    agent = create_react_agent(llm, [generic_retriever, api_retriever, tutorial_retriever, code_retriever])
    
    result = agent.invoke({
        "messages": [{"role": "user", "content": query}]
    })
    
    # Extract tool call information
    tool_calls = []
    for msg in result["messages"]:
        if hasattr(msg, 'tool_calls') and msg.tool_calls:
            for tc in msg.tool_calls:
                tool_calls.append({
                    'tool': tc['name'],
                    'query': tc['args'].get('query', str(tc['args']))
                })
    
    # Print trace
    print(f"\nQuery: {query}")
    print(f"\nTools Called: {len(tool_calls)}")
    for i, tc in enumerate(tool_calls, 1):
        print(f"  {i}. {tc['tool']}")
        print(f"     Query: {tc['query']}")
    
    print(f"\nFinal Answer:")
    print(result["messages"][-1].content)
    
    return result

# Use it
trace_agent_execution("how to use mlflow with databricks jobs")
trace_agent_execution("show me API docs for authentication")
trace_agent_execution("give me code examples for creating clusters")


Query: how to use mlflow with databricks jobs

Tools Called: 1
  1. tutorial_retriever
     Query: how to use mlflow with databricks jobs

Final Answer:
To use MLflow with Databricks jobs, you can follow these steps based on the provided tutorial documentation:

### 1. **Understand MLflow Integration with Databricks**
MLflow is fully integrated into Databricks for experiment tracking, model training, and model management. Each MLflow experiment organizes training runs, logs metrics, parameters, and models, making it easy to track and manage machine learning workflows within Databricks.

### 2. **Create a Databricks Job**
Databricks Jobs allow you to run notebooks or libraries on a scheduled or on-demand basis. You can use a notebook that includes MLflow code as a task in your job.

- If your entire workflow is in a single notebook, you can schedule the notebook directly from the Databricks notebook UI.
- If your workflow consists of multiple steps (e.g., data ingestion, preparation, a

{'messages': [HumanMessage(content='give me code examples for creating clusters', additional_kwargs={}, response_metadata={}, id='9fff0801-5337-4799-a60b-8513e92cde4d'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_e0c7db80-e3b7-4bf9-9e06-9725f474b2a3', 'function': {'arguments': '{"query": "creating clusters"}', 'name': 'code_examples_retriever'}, 'type': 'function'}]}, response_metadata={'usage': {'prompt_tokens': 1805, 'completion_tokens': 24, 'total_tokens': 1829}, 'prompt_tokens': 1805, 'completion_tokens': 24, 'total_tokens': 1829, 'model': 'qwen3-next-instruct-091725', 'model_name': 'qwen3-next-instruct-091725', 'finish_reason': 'tool_calls'}, id='run--d2d79033-7d20-4d53-a177-280148325ea2-0', tool_calls=[{'name': 'code_examples_retriever', 'args': {'query': 'creating clusters'}, 'id': 'call_e0c7db80-e3b7-4bf9-9e06-9725f474b2a3', 'type': 'tool_call'}]),
  ToolMessage(content=[], name='code_examples_retriever', id='c5c2d065-e154-44b6-b06f-635c8b13ff67', to

[Trace(trace_id=tr-19814dcc98825d4fb7c4de8fcfa79b4b), Trace(trace_id=tr-75f32a4f7d898a51f1e1292b71c32ebc), Trace(trace_id=tr-a41d1aff30abaa9dae852660217d7a94)]

Response with Citation

In [0]:
from langgraph.prebuilt import create_react_agent
from langchain_core.messages import SystemMessage
from databricks_langchain import ChatDatabricks

# System prompt with citation instructions
system_message = SystemMessage(content="""You are a Databricks documentation expert assistant.
IMPORTANT INSTRUCTIONS:
1. Use the retrieval tools to find accurate information
2. Answer questions clearly and concisely
3. ALWAYS include citations with URLs for every piece of information you provide
4. Format citations as numbered references [1], [2], etc.
5. List all source URLs at the end of your response
Response format:
<Your detailed answer with inline citations [1], [2]>
Sources:
[1] <URL from first source>
[2] <URL from second source>.""")

# Create agent
llm = ChatDatabricks(endpoint="databricks-qwen3-next-80b-a3b-instruct")
agent = create_react_agent(
    llm,
    [generic_retriever, api_retriever, tutorial_retriever, code_retriever]
)

# Function to query with system prompt
def query_agent(user_query: str):
    """Query the agent with a user question."""
    result = agent.invoke({
        "messages": [
            system_message,  # System prompt (stays the same)
            {"role": "user", "content": user_query}  # User query (dynamic)
        ]
    })
    return result["messages"][-1].content

# Use it with different queries
answer1 = query_agent("how to use mlflow with databricks jobs")
print("*"*70)
print(answer1)

answer2 = query_agent("explain authentication methods")
print("*"*70)
print(answer2)

answer3 = query_agent("show me code examples for creating clusters")
print("*"*70)
print(answer3)

**********************************************************************
To use MLflow with Databricks Jobs, you can integrate MLflow experiment tracking and model management into your scheduled or automated workflows. Below is a step-by-step guide based on official Databricks documentation:

### 1. **Use MLflow with Job Tasks**
Databricks Jobs allow you to run notebooks, JARs, or Python scripts as tasks. When you run MLflow-enabled code inside these tasks, MLflow automatically logs metrics, parameters, and artifacts to the Databricks MLflow Tracking server.

- Define your MLflow experiment in your notebook or script before training a model:
  ```python
  import mlflow

  mlflow.set_experiment("/Users/your-email@databricks.com/your-experiment-name")
  with mlflow.start_run():
      # Your model training code here
      mlflow.log_param("learning_rate", 0.01)
      mlflow.log_metric("accuracy", 0.95)
      mlflow.sklearn.log_model(model, "model")
  ```
  
- Place this code in a notebook a

[Trace(trace_id=tr-a4f5a58f3adcb6d003765f472a0d91c6), Trace(trace_id=tr-ac058d90647dcc2b74f1c8775af60e98), Trace(trace_id=tr-e4fa974c601cea428625d0137ba29567), Trace(trace_id=tr-e842f0eb63a5979962fb580b37cc009b), Trace(trace_id=tr-7e33dea57ec6ce177d569dd27d3d15dc), Trace(trace_id=tr-34b026f77c07d48c2a4b895337ff55b4), Trace(trace_id=tr-51e977ca1b9f22eea678900014b40515), Trace(trace_id=tr-feecd23753d11727db67681a702db470), Trace(trace_id=tr-7c6197949ac362687c65220afed1c460), Trace(trace_id=tr-d546ff3c0ebff95bdb38b4528b5f29eb)]

In [0]:
# Test with tracing to see if URLs are being retrieved
result = agent.invoke({
    "messages": [{"role": "user", "content": "how to use mlflow with databricks jobs"}]
})

print("=" * 70)
print("FINAL ANSWER:")
print("=" * 70)
print(result["messages"][-1].content)

print("\n" + "=" * 70)
print("RETRIEVED SOURCES:")
print("=" * 70)

# Check what URLs were retrieved
for msg in result["messages"]:
    if hasattr(msg, 'name') and 'retriever' in str(msg.name):
        print(f"\nTool: {msg.name}")
        # Print first 200 chars of response to see if URLs are there
        print(str(msg.content)[:500])

Log MLflow experiments and analyse tool calls

In [0]:
import mlflow

mlflow.set_experiment("/Users/shrashti.90@gmail.com/rag-agent-testing")

test_queries = [
    "how to use mlflow with databricks jobs",
    "show me API docs for authentication",
    "give me code examples for creating clusters",
    "How to use Databricks platform for building end-to-end agentic application"
]

agent = create_react_agent(llm, [generic_retriever, api_retriever, tutorial_retriever, code_retriever])

for i, query in enumerate(test_queries, 1):
    with mlflow.start_run(run_name=f"query_{i}"):
        mlflow.log_param("query", query)
        
        result = agent.invoke({
            "messages": [{"role": "user", "content": query}]
        })
        
        # Count tool calls
        tool_calls = []
        for msg in result["messages"]:
            if hasattr(msg, 'tool_calls') and msg.tool_calls:
                tool_calls.extend([tc['name'] for tc in msg.tool_calls])
        
        mlflow.log_metric("num_tool_calls", len(tool_calls))
        mlflow.log_param("tools_used", ", ".join(set(tool_calls)))
        mlflow.log_text(result["messages"][-1].content, "answer.txt")
        
        print(f"Query {i}: {len(tool_calls)} tool calls")

2026/01/03 18:54:56 INFO mlflow.tracking.fluent: Experiment with name '/Users/shrashti.90@gmail.com/rag-agent-testing' does not exist. Creating a new experiment.


Query 1: 1 tool calls
Query 2: 1 tool calls
Query 3: 1 tool calls
Query 4: 1 tool calls


[Trace(trace_id=tr-14d29a84cba7c284edf3e0348ca9b538), Trace(trace_id=tr-d9e94880734dd4a8829ccd3d6e0b8f7b), Trace(trace_id=tr-67553bc800b2664b32d5970c7a14b53d), Trace(trace_id=tr-52f4dd276247d4e746b176444df8c572)]

Clean up Index

In [0]:
# List all indexes on an endpoint

VECTOR_SEARCH_ENDPOINT_NAME = "databricks_doc_index"

indexes = vsc.list_indexes(VECTOR_SEARCH_ENDPOINT_NAME)

print("Existing indexes:")
for index in indexes.get('vector_indexes', []):
    print(f"  - {index['name']}")

Existing indexes:
  - workspace.default.db_docs_index
  - workspace.default.databricks_index


In [0]:


# Delete a specific index
index_name = "workspace.default.databricks_docs_processed_index"

try:
    vsc.delete_index(
        endpoint_name=VECTOR_SEARCH_ENDPOINT_NAME,
        index_name=index_name
    )
    print(f"✓ Deleted index: {index_name}")
except Exception as e:
    print(f"Error deleting index: {e}")

✓ Deleted index: workspace.default.databricks_docs_processed_index
