In [0]:
!pip install -q langchain==0.3.14
!pip install -q langchain-openai==0.3.0
!pip install -q langchain-community==0.3.14
!pip install -q langgraph==0.2.64

In [0]:
%pip install -U -qqqq databricks-langchain uv databricks-agents mlflow-skinny[databricks]
# NOTE(review): none of these packages is version-pinned and `-U` requests upgrades,
# so a re-run may resolve to different versions (and possibly move the langchain
# pins installed earlier) -- consider pinning; TODO confirm the intended versions.

In [0]:
!pip install jq

In [0]:
# Restart the Python process after the installs above; presumably required so the
# freshly installed package versions are the ones that get imported.
dbutils.library.restartPython()

In [0]:
%pip install databricks-vectorsearch
# Restart Python again so the vector search package installed on the line above
# is importable. NOTE(review): the package is unpinned -- consider pinning.
dbutils.library.restartPython()

In [0]:
# import json
# file_path = "../../docs/router_agent_documents.json"

# with open(file_path, "r") as f:
#     knowledge_base = json.load(f)

# knowledge_base[:3]

In [0]:
# knowledge_base

In [0]:
from langchain_community.document_loaders import JSONLoader

file_path = "../../docs/router_agent_documents.json"

# First pass: jq_schema='.' selects the JSON root, i.e. the file is read as a whole.
# text_content=False lets the loader accept non-string (structured) content.
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.',
    text_content=False,
)
documents = loader.load()

print(f"Loaded {len(documents)} document(s).")

# Peek at the first document: its metadata and the first 1000 characters of content.
first_document = documents[0]
print(first_document.metadata)
print(first_document.page_content[:1000])

In [0]:
# Display the metadata dict of the first loaded document.
documents[0].metadata

In [0]:
# Display the full (untruncated) content of the first loaded document.
documents[0].page_content

In [0]:
from langchain_community.document_loaders import JSONLoader

file_path = "../../docs/router_agent_documents.json"

# Second pass: jq_schema='.messages[]' emits one document per element of the
# top-level "messages" array. text_content=False keeps each element as
# structured (JSON) content instead of requiring a plain string.
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.messages[]',
    text_content=False,
)
documents = loader.load()

print(f"Loaded {len(documents)} document(s).")

# Peek at the first message: its metadata and the first 1000 characters of content.
first_document = documents[0]
print(first_document.metadata)
print(first_document.page_content[:1000])

In [0]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,      # ~800 characters per chunk
    chunk_overlap=100,   # 100 characters overlap to preserve context
)

# NOTE(review): the documents were loaded with text_content=False and their
# page_content is later parsed with json.loads, so it is a JSON string here.
# Presumably any record longer than chunk_size would be cut into fragments that
# are no longer valid JSON -- verify that every record fits in a single chunk.
docs = text_splitter.split_documents(documents)

print(f"Created {len(docs)} chunks.")

# Preview only the first few chunks instead of dumping the whole list to the output.
docs[:3]

In [0]:
# Catalog and schema names used to build the fully qualified name of the
# table that the chunks are written to.
catalog = "agentic_ai"
schema = "langgraph"

In [0]:
import json

In [0]:
# Turn every chunk into a flat row: a 1-based id, the text, and its category.
chunk_data = []
for chunk_id, chunk in enumerate(docs, start=1):
    # Each chunk's page_content is a JSON string with "text" and "metadata" keys.
    payload = json.loads(chunk.page_content)
    row = {
        "chunk_id": chunk_id,
        "content": payload["text"],
        "category": payload["metadata"]["category"],
    }
    chunk_data.append(row)

# Build a Spark DataFrame from the rows.
spark_df = spark.createDataFrame(chunk_data)

# Persist as a Delta table, replacing any previous contents.
target_table = f"{catalog}.{schema}.router_agent_chunks"
spark_df.write.format("delta").mode("overwrite").saveAsTable(target_table)


In [0]:
# Use the same catalog/schema variables as the write above so this cell always
# shows the table that was actually written.
display(spark.table(f"{catalog}.{schema}.router_agent_chunks"))

In [0]:
# Turn on the Delta change data feed for the chunks table (the Delta Sync index
# created below reads from this table). The table name is built from the
# catalog/schema variables so it stays in step with the write above.
spark.sql(
    f"ALTER TABLE {catalog}.{schema}.router_agent_chunks "
    "SET TBLPROPERTIES (delta.enableChangeDataFeed = true)"
)

### Now you can create vector serach index using this table, Follow the below doc

https://docs.databricks.com/aws/en/generative-ai/create-query-vector-search

## Create a vector search endpoint

In [0]:
from databricks.vector_search.client import VectorSearchClient

In [0]:
# The following line automatically generates a PAT Token for authentication
client = VectorSearchClient()

# The following line uses the service principal token for authentication
# client = VectorSearchClient(service_principal_client_id=<CLIENT_ID>,service_principal_client_secret=<CLIENT_SECRET>)

# Only create the endpoint when it does not exist yet, so that re-running the
# notebook does not fail on an "already exists" error.
existing_endpoint_names = {
    endpoint["name"] for endpoint in client.list_endpoints().get("endpoints", [])
}

if "router_agent_endpoint" not in existing_endpoint_names:
    client.create_endpoint(
        name="router_agent_endpoint",
        endpoint_type="STANDARD" # or "STORAGE_OPTIMIZED"
    )

## Create Vector Search Index

In [0]:
# The following example creates a Delta Sync Index whose embeddings are computed
# by Databricks: the `content` column is embedded with the model serving endpoint
# named in `embedding_model_endpoint_name` (these are NOT self-managed embeddings).


endpoint_name="router_agent_endpoint"
# Derive the table/index names from the catalog/schema the chunks were written to.
source_table_name=f"{catalog}.{schema}.router_agent_chunks"
index_name=f"{catalog}.{schema}.router_agent_index"


client = VectorSearchClient()

# Re-use the index when it already exists so the cell can be re-run safely.
existing_index_names = {
    existing["name"]
    for existing in client.list_indexes(endpoint_name).get("vector_indexes", [])
}

if index_name in existing_index_names:
    index = client.get_index(endpoint_name=endpoint_name, index_name=index_name)
else:
    index = client.create_delta_sync_index(
      endpoint_name=endpoint_name,
      source_table_name=source_table_name,
      index_name=index_name,
      pipeline_type="TRIGGERED",
      primary_key="chunk_id",
      embedding_source_column="content",
      embedding_model_endpoint_name="databricks-gte-large-en", # This model is used for ingestion, and is also used for querying unless model_endpoint_name_for_query is specified.
    )

# Block until the index is ready, because the next section queries it straight away.
index.wait_until_ready()

## Query a vector search endpoint

In [0]:
# Query the Delta Sync Index (embeddings computed by Databricks).

from databricks.vector_search.client import VectorSearchClient

client = VectorSearchClient()

endpoint_name="router_agent_endpoint"
# Build the names from the catalog/schema variables so this cell queries the
# index that belongs to the table written earlier in the notebook.
source_table_name=f"{catalog}.{schema}.router_agent_chunks"
index_name=f"{catalog}.{schema}.router_agent_index"

index = client.get_index(endpoint_name=endpoint_name, index_name=index_name)

# Hybrid search restricted to rows whose category is "general"; the response is
# kept in `results` for the conversion to LangChain documents further down.
results = index.similarity_search(
    query_text="what is your refund policy?",
    columns=["content","category"],  # Ensure only columns present in the index are listed
    num_results=3,
    filters={"category": ["general"]},
    query_type="hybrid"
)
results

In [0]:
# Filters are case sensitive: using "General" instead of "general" returns no
# similar documents.
# The response is stored under its own name so this demonstration does not
# overwrite `results`, which the conversion cell below turns into LangChain documents.
results_wrong_case = index.similarity_search(
    query_text="what is your refund policy?",
    columns=["content","category"],  # Ensure only columns present in the index are listed
    num_results=3,
    filters={"category": ["General"]},
    query_type="hybrid"
)
results_wrong_case


## Convert results to LangChain documents
The first column retrieved is loaded into page_content, and the rest into metadata.

In [0]:
from langchain.schema import Document
from typing import List

def convert_vector_search_to_documents(results) -> List[Document]:
    """Convert a vector search response into LangChain ``Document`` objects.

    Args:
        results: Response dict of ``index.similarity_search``. Column
            descriptors are read from ``results["manifest"]["columns"]`` (each
            with a ``"name"`` key) and rows from
            ``results["result"]["data_array"]``.

    Returns:
        One ``Document`` per row. The first value of a row becomes
        ``page_content``; the values between the first and the last are stored
        in ``metadata`` under their column names. The last value of each row
        (the score) is not included.

        An empty list is returned when the response has no rows, including
        when ``data_array`` is missing from the response.
    """
    columns = results.get("manifest", {}).get("columns", [])
    column_names = [column["name"] for column in columns]

    # A response without matches may lack "data_array" (or hold None); treat both as no rows.
    rows = results.get("result", {}).get("data_array") or []

    langchain_docs = []
    for row in rows:
        # Row layout: [first column, ...other requested columns..., score]
        metadata = dict(zip(column_names[1:-1], row[1:-1]))
        langchain_docs.append(Document(page_content=row[0], metadata=metadata))
    return langchain_docs

# Convert the search response currently held in `results` into LangChain documents.
langchain_docs = convert_vector_search_to_documents(results)

langchain_docs

In [0]:
# Print the type of every converted element to check what the conversion produced.
for elem in langchain_docs:
    print(type(elem))

In [0]:
# Concatenate the page_content of all retrieved documents, separated by blank lines.
retrieved_content = "\n\n".join(doc.page_content for doc in langchain_docs)
retrieved_content