This notebook takes the output of the information-extraction step and builds vector search indexes over the preamble, references, metadata, and sections of each contract.

In [0]:
%pip install databricks-vectorsearch

In [0]:
from databricks.vector_search.client import VectorSearchClient
# Shared client used by the later cells to list/create endpoints and to create,
# fetch, and sync the vector search indexes.
# NOTE(review): constructed with no arguments — presumably it picks up the
# notebook's workspace credentials automatically; confirm for job runs.
client = VectorSearchClient()

In [0]:
%sql
-- Preview the source `sections` table in the catalog/schema selected by the
-- notebook widgets (the same :catalog / :schema parameter markers used by the
-- vector_search queries in this notebook), instead of a hard-coded location.
SELECT * FROM IDENTIFIER(:catalog || '.' || :schema || '.sections')

In [0]:
# Notebook parameters, each read from the Databricks widget of the same name.
catalog, schema, endpoint_name = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalog", "schema", "endpoint_name")
)

# Per-source-table index configuration, keyed by table name:
#   columns_to_sync - columns copied from the source table into the index
#   vs_col          - text column that gets embedded for similarity search
tables = {
    # Contract preamble plus vendor/file descriptors.
    "flat": {
        "columns_to_sync": ["path", "vendor_name", "file_name", "other_vendor_files", "preamble"],
        "vs_col": "preamble",
    },
    # References of each contract, combined into a single text column.
    "references": {
        "columns_to_sync": ["path", "combined_references"],
        "vs_col": "combined_references",
    },
    # Metadata of each contract, combined into a single text column.
    "metadata": {
        "columns_to_sync": ["path", "combined_metadata"],
        "vs_col": "combined_metadata",
    },
    # Individual contract sections.
    "sections": {
        "columns_to_sync": ["path", "section_id", "section_title", "combined_text"],
        "vs_col": "combined_text",
    },
}

This checks whether the vector search endpoint already exists and creates it if it does not.

In [0]:
# Names of the vector search endpoints that already exist in the workspace.
# The response may omit the 'endpoints' key when there are none, so default to
# an empty list instead of raising KeyError.
endpoints = [ep['name'] for ep in client.list_endpoints().get('endpoints', [])]
if endpoint_name not in endpoints:
    # Endpoint provisioning is asynchronous; wait for it to finish so that the
    # index-creation cell does not run against an endpoint that is not ready.
    client.create_endpoint_and_wait(
        name=endpoint_name,
        endpoint_type="STANDARD"
    )

Once the index is created, we can also sync it if we update the underlying table. This requires change data feed, so we enable it for each table.

In [0]:
# Syncing an index after its source table changes relies on the table's Change
# Data Feed, so switch it on for every table listed in the `tables` config.
for tbl in tables:
    enable_cdf_statement = f"""
        ALTER TABLE IDENTIFIER('{catalog}.{schema}.{tbl}') 
        SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
    spark.sql(enable_cdf_statement)

We are going to make four vector search indexes, one per source table:
- The Preamble from each contract
- The References from each contract
- The Metadata from each contract
- The Sections from each contract

In [0]:
# Display the full per-table index configuration explicitly, rather than
# indexing it with a loop variable left over from an earlier cell (which breaks
# if the cells are run out of order or that loop is edited).
tables

In [0]:
# Create a Delta Sync index for each configured table, or trigger a sync when
# the index already exists.
for tbl in tables.keys():
    source_table_name = f"{catalog}.{schema}.{tbl}"
    index_name = f"{source_table_name}_index"

    # Look the index up first instead of treating *any* creation failure as
    # "index already exists": a permissions problem or a bad column name should
    # surface as an error, not be printed and followed by a confusing
    # get_index/sync failure.
    try:
        index = client.get_index(index_name=index_name)
        index.describe()  # forces a round trip so a missing index is detected here
    except Exception as lookup_error:
        print(f'No existing index found for {tbl} ({lookup_error}); creating it.')
        index = None

    if index is None:
        index = client.create_delta_sync_index(
            endpoint_name=endpoint_name,
            source_table_name=source_table_name,
            index_name=index_name,
            pipeline_type="TRIGGERED",
            # NOTE(review): "path" is used as the primary key for every table.
            # For `sections` (which also syncs `section_id`) there are presumably
            # several rows per path, so "path" may not be unique there — confirm.
            # A table's config entry may set 'primary_key' to override the default.
            primary_key=tables[tbl].get('primary_key', 'path'),
            columns_to_sync=tables[tbl]['columns_to_sync'],
            embedding_source_column=tables[tbl]['vs_col'],
            embedding_model_endpoint_name="databricks-gte-large-en",
        )
        print(f'Creating index for {tbl}!')
    else:
        # TRIGGERED pipelines only refresh on demand. sync() starts the refresh;
        # it has not necessarily finished when this message is printed.
        index.sync()
        print(f'Sync triggered for {tbl}!')

We can now query an index directly from SQL. Here we retrieve the top 4 hybrid-search matches from the preamble (`flat`) index.

In [0]:
%sql
-- Query the index built over the `flat` table (embedded column: preamble) in
-- the catalog/schema given by the notebook widgets.
-- Returns every column of up to 4 results for the query text, using the
-- HYBRID query type.
SELECT * 
FROM vector_search(
  index => :catalog || '.' || :schema || '.flat_index',
  query_text => 'TAZA Supplies',
  query_type => 'HYBRID',
  num_results => 4
)

We can also concatenate all columns of each matching row and combine the rows into a single block of text.

In [0]:
%sql
-- Collapse the search results into one text value:
--   concat_ws(' | ', *) joins every column of a result row with ' | ';
--   string_agg(..., '\n') then joins the rows, separated by '\n', into the
--   single `table_text` column.
-- NOTE(review): concat_ws is applied to all returned columns, including any
-- non-string ones — confirm they cast to text as intended.
SELECT
  string_agg(concat_ws(' | ', *), '\n') AS table_text
FROM vector_search(
  index => :catalog || '.' || :schema || '.flat_index',
  query_text => 'Purchase Order No. 70000391103',
  query_type => 'HYBRID',
  num_results => 3
)