This notebook takes the output of the information extraction step and spins up a vector search index for searching across the preamble and extracted information of each contract.

In [0]:
%pip install databricks-vectorsearch

In [0]:
from databricks.vector_search.client import VectorSearchClient
# Shared client that the cells below use to list/create the endpoint and to
# create, fetch, and sync the index.
client = VectorSearchClient()

In [0]:
# Notebook parameters, read from widgets of the same name. The %sql cells in
# this notebook refer to the catalog and schema as :catalog / :schema.
catalog, schema, endpoint_name = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalog", "schema", "endpoint_name")
)

Check whether the vector search endpoint already exists and, if it does not, create it.

In [0]:
# Names of the vector search endpoints visible to this client.
# `.get('endpoints', [])` keeps the cell working when the response carries no
# 'endpoints' key, where direct indexing would raise KeyError.
endpoints = [ep['name'] for ep in client.list_endpoints().get('endpoints', [])]

# Create the endpoint only when it is missing, so re-running the cell is safe.
# NOTE(review): create_endpoint is not followed by any readiness wait here;
# confirm whether the index creation further down needs the endpoint to be
# fully provisioned first.
if endpoint_name not in endpoints:
    client.create_endpoint(
        name=endpoint_name,
        endpoint_type="STANDARD"
    )

In [0]:
%sql
-- Turn on Change Data Feed for the source table before the Delta Sync index
-- is created from it (see the Databricks Vector Search documentation).
ALTER TABLE IDENTIFIER(:catalog || '.' || :schema || '.extracted')
SET TBLPROPERTIES (delta.enableChangeDataFeed = true)

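Below we preview the source table and then build the index from it. Once the index is created, we can also sync it whenever the underlying table is updated: the index cell creates the index on the first run and triggers a sync on later runs.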

In [0]:
%sql
-- Preview the source table that the index is built from.
SELECT
  *
FROM
  IDENTIFIER(:catalog || '.' || :schema || '.extracted')

In [0]:
# Fully qualified names of the source table and of the index built from it.
source_table_name = f"{catalog}.{schema}.extracted"
index_name = f"{catalog}.{schema}.index"

try:
    # First run: create a triggered Delta Sync index keyed on `path`, with
    # embeddings computed from the `key_information` column.
    index = client.create_delta_sync_index(
        endpoint_name=endpoint_name,
        source_table_name=source_table_name,
        index_name=index_name,
        pipeline_type="TRIGGERED",
        primary_key="path",
        columns_to_sync=['path', 'vendor_name', 'file_name', 'other_vendor_files', 'preamble', 'text', 'key_information'],
        embedding_source_column="key_information",
        embedding_model_endpoint_name="databricks-gte-large-en",
    )
except Exception as e:
    print(e)
    # Creation is expected to fail when the index already exists; in that case
    # fetch the existing index and sync it instead.
    try:
        index = client.get_index(index_name=index_name)
    except Exception:
        index = None
    if index is None:
        # The index could not be fetched either, so creation failed for some
        # other reason. Re-raise the original creation error rather than
        # letting the lookup failure hide it.
        raise
    index.sync()
    # NOTE(review): nothing here waits on the sync pipeline; confirm whether
    # the sync has actually finished by the time this message is printed.
    print('Sync Complete!')

In [0]:
%sql
-- Smoke-test the index: top 20 HYBRID matches for a sample purchase order number.
SELECT
  *
FROM
  vector_search(
    index       => :catalog || '.' || :schema || '.index',
    query_text  => 'Purchase Order No. 70000391103',
    query_type  => 'HYBRID',
    num_results => 20
  )

In [0]:
%sql
-- Collapse the top 20 HYBRID matches for the sample query into a single row:
--   key_info     : every match's key_information, separated by '---' lines
--   vendor_files : all other_vendor_files entries, flattened and comma-joined
--   vendors      : the vendor_name of every match, comma-separated
SELECT
  array_join(collect_list(key_information), '\n---\n')         AS key_info,
  array_join(flatten(collect_list(other_vendor_files)), ',')   AS vendor_files,
  array_join(collect_list(vendor_name), ', ')                  AS vendors
FROM
  vector_search(
    index       => :catalog || '.' || :schema || '.index',
    query_text  => 'Purchase Order No. 70000391103',
    query_type  => 'HYBRID',
    num_results => 20
  )