# Self-Managed Vector Search Index
Configure Databricks Vector Search to ingest data from table the we create in the Data Prepartion notebook


Install the required libraries and include the helper function

In [0]:
%pip install -U --quiet mlflow==2.14.3 databricks-vectorsearch==0.40 transformers==4.43.3 langchain==0.2.11 langchain-community==0.2.10 pydantic==2.8.2 flashrank==0.2.8 accelerate PyPDF2
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%run ../Includes/_helper_functions

Create the vector search endpoint with 'vs_endpoint_' as prefix

In [0]:
# assign vs search endpoint by username
vs_endpoint_prefix = "vs_endpoint_"
vs_endpoint_name = vs_endpoint_prefix + "1"
print(f"Assigned Vector Search endpoint name: {vs_endpoint_name}.")

Assigned Vector Search endpoint name: vs_endpoint_1.


In [0]:
from databricks.vector_search.client import VectorSearchClient
from databricks.sdk import WorkspaceClient
import databricks.sdk.service.catalog as c

vsc = VectorSearchClient(disable_notice=True)

In [0]:
# IF YOU HAVE ENDPOINT CREATION PERMISSIONS, UNCOMMENT THIS CODE AND RUN IT TO CREATE AN ENDPOINT

vsc.create_endpoint(name=vs_endpoint_name, endpoint_type="STANDARD")

In [0]:
wait_for_vs_endpoint_to_be_ready(vsc, vs_endpoint_name)
print(f"Endpoint named {vs_endpoint_name} is ready.")

Endpoint named vs_endpoint_1 is ready.


### Connect Delta Table with Vector Search Endpoint

After creating the endpoint, we created the **vector search index**. The vector search index is created from a Delta table and is optimized to provide real-time approximate nearest neighbor searches.

In [0]:
# the table we'd like to index
source_table_fullname = "workspace.default.pdf_text_embeddings"

# where we want to store our index
vs_index_fullname = "workspace.default.pdf_text_self_managed_vs_index"

# create or sync the index
if not index_exists(vsc, vs_endpoint_name, vs_index_fullname):
  print(f"Creating index {vs_index_fullname} on endpoint {vs_endpoint_name}...")
  vsc.create_delta_sync_index(
    endpoint_name=vs_endpoint_name,
    index_name=vs_index_fullname,
    source_table_name=source_table_fullname,
    pipeline_type="TRIGGERED", # Sync needs to be manually triggered
    primary_key="id",
    embedding_dimension=768, # Match your model embedding size (gte)
    embedding_vector_column="embedding"
  )
else:
  # trigger a sync to update our vs content with the new data saved in the table
  vsc.get_index(vs_endpoint_name, vs_index_fullname).sync()

# let's wait for the index to be ready and all our embeddings to be created and indexed
wait_for_index_to_be_ready(vsc, vs_endpoint_name, vs_index_fullname)

Search for Similar Content

In [0]:
import mlflow.deployments

deploy_client = mlflow.deployments.get_deploy_client("databricks")
question = "what is spam?"
response = deploy_client.predict(endpoint="RAGdatabricks-gte-base-en", inputs={"input": [question]})
embeddings = [e["embedding"] for e in response.data]
print(embeddings)

[[0.2325439453125, -0.0018243789672851562, 0.300537109375, -0.19775390625, 0.70458984375, -0.195068359375, 0.85107421875, -1.67578125, 1.662109375, -0.476318359375, 0.1341552734375, -1.5048828125, 0.441162109375, -0.1219482421875, 1.9208984375, -0.158203125, -1.40625, 0.05926513671875, 0.313720703125, 0.52001953125, -0.12237548828125, -0.71435546875, 0.68798828125, 0.595703125, -0.223388671875, -0.1600341796875, -0.4560546875, 0.335205078125, -0.861328125, -0.68505859375, 0.287841796875, -0.219970703125, 0.65869140625, 0.99267578125, -0.35693359375, -0.245849609375, -1.1181640625, 1.2421875, -0.002536773681640625, -0.505859375, -0.4296875, 0.6826171875, -0.09503173828125, -0.9521484375, 2.4765625, 0.77783203125, -0.58203125, -0.00829315185546875, 0.274169921875, -0.9677734375, 0.2152099609375, 1.48046875, -0.5029296875, -0.9404296875, -0.6455078125, 0.98291015625, 0.6552734375, -0.222412109375, -0.669921875, -1.5947265625, -0.491455078125, 0.07080078125, -0.279296875, -0.454833984375, 

In [0]:
# get similar 5 documents.
results = vsc.get_index(vs_endpoint_name, vs_index_fullname).similarity_search(
  query_vector=embeddings[0],
  columns=["pdf_name", "content"],
  num_results=5)

# format result to align with reranker lib format. 
passages = []
for doc in results.get("result", {}).get("data_array", []):
    new_doc = {"file": doc[0], "text": doc[1]}
    passages.append(new_doc)

print(passages)

[{'file': 'dbfs:/Volumes/workspace/default/raw_data/KB0000029.pdf', 'text': 'Knowledge Details Page 1\nRun By : System Administrator 2024-10-21 08:53:28 Pacific Daylight TimeReport Title: Knowledge Details\nRun Date and Time: 2024-10-21 08:53:28 Pacific Daylight Time\nRun by: System Administrator\nTable name: kb_knowledge\nKnowledge\nNumber: KB0000029\nKnowledge base: IT\nCategory: Email\nPublished: 2014-09-09\nValid to: 2100-01-01Article type: HTML\nWorkflow: Published\nSource Task:\nAttachment link: false\nDisplay attachments: false\nShort description:\nWhat is Spam?\nArticle body:\nWhat is Spam? \nSpam\xa0has increasingly become a problem on the Internet. While every Internet user receives some spam, email addresses posted to web sites or in \nnewsgroups and chat rooms attract the most spam.\' \nDefinitions \nThe term "spam" is\xa0Internet\xa0slang that refers to unsolicited commercial email (UCE) or unsolicited bulk email (UBE). Some people refer to this kind of \ncommunication as 

In [0]:
import os
working_dir = os.getcwd()
print("Current working directory:", working_dir)

Current working directory: /Workspace/Shared/GenAI


Re-ranking Search Results

In [0]:
from flashrank import Ranker, RerankRequest

# Ensure the model file exists at this path or update the path accordingly
cache_dir = "/Workspace/Shared/GenAI/opt"

ranker = Ranker(model_name="rank-T5-flan", cache_dir=cache_dir)

rerankrequest = RerankRequest(query=question, passages=passages)
results = ranker.rerank(rerankrequest)
print(*results[:3], sep="\n\n")

INFO:flashrank.Ranker:Downloading rank-T5-flan...
rank-T5-flan.zip:   0%|          | 0.00/73.7M [00:00<?, ?iB/s]rank-T5-flan.zip:   6%|▋         | 4.62M/73.7M [00:00<00:01, 48.4MiB/s]rank-T5-flan.zip:  13%|█▎        | 9.25M/73.7M [00:00<00:01, 46.0MiB/s]rank-T5-flan.zip:  19%|█▊        | 13.6M/73.7M [00:00<00:01, 43.7MiB/s]rank-T5-flan.zip:  24%|██▍       | 17.8M/73.7M [00:00<00:01, 43.5MiB/s]rank-T5-flan.zip:  30%|██▉       | 22.0M/73.7M [00:00<00:01, 43.5MiB/s]rank-T5-flan.zip:  35%|███▌      | 26.1M/73.7M [00:00<00:01, 43.3MiB/s]rank-T5-flan.zip:  41%|████      | 30.3M/73.7M [00:00<00:01, 43.3MiB/s]rank-T5-flan.zip:  47%|████▋     | 34.4M/73.7M [00:00<00:00, 42.9MiB/s]rank-T5-flan.zip:  52%|█████▏    | 38.5M/73.7M [00:00<00:00, 42.9MiB/s]rank-T5-flan.zip:  58%|█████▊    | 42.6M/73.7M [00:01<00:00, 41.0MiB/s]rank-T5-flan.zip:  64%|██████▍   | 47.0M/73.7M [00:01<00:00, 42.5MiB/s]rank-T5-flan.zip:  70%|██████▉   | 51.3M/73.7M [00:01<00:00, 43.2MiB/s]rank-T5-flan.zip:  75%

{'file': 'dbfs:/Volumes/workspace/default/raw_data/KB0000011.pdf', 'text': "Knowledge List Page 1\nRun By : System Administrator 2024-10-22 22:39:34 Pacific Daylight TimeReport Title: Knowledge List\nRun Date and Time: 2024-10-22 22:39:34 Pacific Daylight Time\nRun by: System Administrator\nTable name: kb_knowledge\nQuery Condition: Number = KB0000011\nSort Order: Number in descending order\n1 Knowledge\n▼ Number Short description Author Category Workflow Updated\nKB0000011 How to Deal with Spam Ron Kettering Email Published 2014-12-19 07:54:36\nKnowledge List Page 2\nRun By : System Administrator 2024-10-22 22:39:34 Pacific Daylight TimeKnowledge\nNumber: KB0000011\nKnowledge base: IT\nCategory: Email\nPublished: 2014-09-09\nValid to: 2100-01-01Article type: HTML\nWorkflow: Published\nSource Task:\nAttachment link: false\nDisplay attachments: false\nShort description:\nHow to Deal with Spam\nArticle body:\nHow to Deal with Spam \nSpam\xa0has increasingly become a problem on the Intern