This notebook takes the output of the information-extraction step and sets up a Vector Search index for searching across the preamble and extracted information of each contract.

In [0]:
%pip install databricks-vectorsearch

In [0]:
from databricks.vector_search.client import VectorSearchClient
# Single Vector Search client shared by the cells below to list/create
# endpoints and to create, fetch and sync the index.
# NOTE(review): constructed with no arguments, so it presumably picks up the
# notebook's own workspace credentials — confirm if run outside Databricks.
client = VectorSearchClient()

The next cell checks whether the Vector Search endpoint already exists and creates it if it does not.

In [0]:
# Name of the Vector Search endpoint that will host the contracts index.
endpoint_name = "dbdemos_vs_endpoint"

# Collect the names of the endpoints that already exist. The response may not
# contain an "endpoints" key at all when the workspace has no endpoints yet,
# so fall back to an empty list instead of raising KeyError.
endpoints = [ep["name"] for ep in client.list_endpoints().get("endpoints", [])]

# Create the endpoint only when it is missing, so the cell is safe to re-run.
if endpoint_name not in endpoints:
    client.create_endpoint(
        name=endpoint_name,
        endpoint_type="STANDARD",
    )

In [0]:
%sql
-- Turn on Delta Change Data Feed for the table that the index cell below uses
-- as its source table.
-- NOTE(review): Delta Sync indexes are documented to require this property on
-- their source table — confirm against the Vector Search docs.
ALTER TABLE shm.contracts.extracted SET TBLPROPERTIES (delta.enableChangeDataFeed = true)

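The next cell creates the Delta Sync index. If the index cannot be created because it already exists, the cell fetches the existing index and triggers a sync instead, so the index picks up any changes made to the underlying table.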

In [0]:
# Create the Delta Sync index over the extracted-contracts table, or, when
# creation fails (typically because the index already exists), fetch the
# existing index and trigger a sync so it catches up with the source table.
source_table_name = "shm.contracts.extracted"
index_name = "shm.contracts.index"

try:
    index = client.create_delta_sync_index(
        endpoint_name=endpoint_name,  # same endpoint the cell above ensured exists
        source_table_name=source_table_name,
        index_name=index_name,
        pipeline_type="TRIGGERED",  # index only updates when a sync is requested
        primary_key="path",
        columns_to_sync=["path", "preamble", "truncated", "key_information"],
        embedding_source_column="key_information",
        embedding_model_endpoint_name="databricks-gte-large-en",
    )
except Exception as create_error:
    # Creation can fail for reasons other than "already exists" (bad table
    # name, missing permissions, ...). Show the error, then try to look the
    # index up; if the lookup fails too, surface the original creation error
    # rather than a misleading "index not found".
    print(create_error)
    try:
        # Pass the endpoint explicitly: some SDK versions require it.
        index = client.get_index(endpoint_name=endpoint_name, index_name=index_name)
    except Exception:
        raise create_error
    index.sync()