# Contract Analysis
## Vector Search
Vector search can supercharge LLMs by providing relevant context. Unfortunately, hundreds of contracts with over 50 pages apiece are too much for LLMs to deal with, so we have done a lot of work to compress and summarize information. This notebook takes the extracted information and spins up a vector search for searching across the preamble, references, metadata, and sections of each contract. We can then use this vector search in AI_QUERY and agentic flows.

In [0]:
%pip install databricks-vectorsearch
%restart_python

In [0]:
from databricks.vector_search.client import VectorSearchClient
# Vector Search client built with default arguments; the cells below use it to
# list/create endpoints and to create, fetch, and sync indexes.
client = VectorSearchClient()

In [0]:
# Notebook parameters, read from the widgets of the same names.
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")
endpoint_name = dbutils.widgets.get("endpoint_name")

# Tables to index, keyed by table name inside `catalog.schema`.
#   columns_to_sync -- columns passed to the index as `columns_to_sync`
#   vs_col          -- text column passed as the embedding source column
tables = {
    "flat": {
        "columns_to_sync": [
            "path",
            "vendor_name",
            "file_name",
            "vendor_folder_paths",
            "preamble",
        ],
        "vs_col": "preamble",
    },
    "doc_info": {
        "columns_to_sync": ["path", "combined_doc_info"],
        "vs_col": "combined_doc_info",
    },
    "sections": {
        "columns_to_sync": [
            "path",
            "section_id",
            "section_title",
            "combined_text",
        ],
        "vs_col": "combined_text",
    },
}

## Sections
This code takes our `.flat` table and makes sections for vector search. This is mainly meant for agentic flows so the agent can check the classification for each contract by searching sections.

In [0]:
%sql
-- Target table for the section-level text built by the MERGE cell below.
-- One row per (path, section_id):
--   section_id     running number of the section header within a document
--   section_title  content of that section header
--   text           newline-joined text elements of the section
--   combined_text  section title followed by the section text
CREATE TABLE IF NOT EXISTS IDENTIFIER(:catalog || '.' || :schema || '.sections') (
  path          STRING,
  section_id    INT,
  section_title STRING,
  text          STRING,
  combined_text STRING
)

In [0]:
%sql
-- Upsert one row per (path, section_id) into the sections table.
-- Steps:
--   1. exploded_elements: one row per parsed document element that has an id.
--   2. section_headers:   number the section headers of each document in id order.
--   3. section_tracking:  attach to every element the closest header at or before it.
--   4. final select:      join the text elements of each section with newlines.
MERGE INTO IDENTIFIER(:catalog || '.' || :schema || '.sections') AS target
USING (
  WITH exploded_elements AS (
    SELECT
      p.path,
      element:id::STRING AS element_id,
      element:type::STRING AS element_type,
      element:content::STRING AS element_content
    FROM (SELECT * FROM IDENTIFIER(:catalog || '.' || :schema || '.' || :parsed_table)) AS p,
    LATERAL explode(cast(p.parsed:document:elements AS ARRAY<VARIANT>)) AS t(element)
    WHERE element:id IS NOT NULL
  ),
  section_headers AS (
    SELECT
      path,
      element_id,
      element_content,
      ROW_NUMBER() OVER (PARTITION BY path ORDER BY element_id::INT) AS mono_section_id
    FROM exploded_elements
    WHERE element_type = 'section_header'
  ),
  section_tracking AS (
    SELECT
      e.path,
      e.element_id,
      e.element_type,
      e.element_content,
      sh.mono_section_id AS section_id,
      sh.element_content AS section_title
    FROM exploded_elements e
    -- Elements that appear before the first header get NULL section columns.
    LEFT JOIN LATERAL (
      SELECT mono_section_id, element_content
      FROM section_headers sh
      WHERE sh.path = e.path AND sh.element_id::INT <= e.element_id::INT
      ORDER BY sh.element_id::INT DESC
      LIMIT 1
    ) sh ON TRUE
  ),
  section_text AS (
    SELECT
      path,
      section_id,
      section_title,
      -- COLLECT_LIST alone gives no ordering guarantee, so collect
      -- (position, content) pairs and sort them to keep the text in
      -- document order before joining it.
      TRANSFORM(
        SORT_ARRAY(
          COLLECT_LIST(
            NAMED_STRUCT('pos', element_id::INT, 'content', element_content)
          ) FILTER (WHERE element_content IS NOT NULL)
        ),
        el -> el.content
      ) AS ordered_content
    FROM section_tracking
    WHERE element_type = 'text'
      AND section_id IS NOT NULL
    GROUP BY path, section_id, section_title
  )
  SELECT
    path,
    section_id,
    section_title,
    CONCAT_WS('\n', ordered_content) AS text,
    CONCAT_WS('\n', section_title, ordered_content) AS combined_text
  FROM section_text
) AS source
ON target.path = source.path AND target.section_id = source.section_id
WHEN MATCHED THEN
  UPDATE SET
    section_title = source.section_title,
    text = source.text,
    combined_text = source.combined_text
WHEN NOT MATCHED THEN
  INSERT (path, section_id, section_title, text, combined_text)
  VALUES (source.path, source.section_id, source.section_title, source.text, source.combined_text);

In [0]:
%sql
-- Quick look at a few rows of the sections table.
SELECT *
  FROM IDENTIFIER(:catalog || '.' || :schema || '.sections')
 LIMIT 3

## Vector Search Setup
This section of the code sets up our vector search tables using the widgets in the notebook. First, we check if the endpoint exists, and if not, generate it. Next, we ensure that change data feed is enabled on all the tables. We then generate our vector search indexes. Once the index is created, we can also sync it if we update the underlying table. 

In [0]:
# Names of the Vector Search endpoints that already exist.
# `.get` avoids a KeyError if the response has no "endpoints" key
# (presumably the case when the workspace has no endpoints yet -- confirm).
endpoints = [ep["name"] for ep in client.list_endpoints().get("endpoints", [])]

if endpoint_name not in endpoints:
    # Wait for provisioning to finish: the cells below create indexes on this
    # endpoint right away.
    client.create_endpoint_and_wait(
        name=endpoint_name,
        endpoint_type="STANDARD",
    )

In [0]:
# Turn on the Delta change data feed for every table that will be indexed.
for tbl in tables:
    enable_cdf_sql = f"""
        ALTER TABLE IDENTIFIER('{catalog}.{schema}.{tbl}') 
        SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
    spark.sql(enable_cdf_sql)

In [0]:
# Create a triggered Delta Sync index for every configured table. If creation
# fails (typically because the index already exists), fetch the existing index
# and trigger a sync instead.
#
# NOTE(review): the primary key defaults to `path` unless a table's config
# supplies a 'primary_key' entry. The `sections` table holds one row per
# (path, section_id), so `path` does not uniquely identify its rows. Vector
# Search presumably needs a unique key -- confirm, then give `sections` a
# single unique key column and set it as that table's 'primary_key'.
for tbl, cfg in tables.items():
    source_table_name = f"{catalog}.{schema}.{tbl}"
    index_name = f"{source_table_name}_index"
    try:
        index = client.create_delta_sync_index(
            endpoint_name=endpoint_name,
            source_table_name=source_table_name,
            index_name=index_name,
            pipeline_type="TRIGGERED",
            primary_key=cfg.get('primary_key', 'path'),
            columns_to_sync=cfg['columns_to_sync'],
            embedding_source_column=cfg['vs_col'],
            embedding_model_endpoint_name="databricks-gte-large-en",
        )
        print(f'Creating index for {tbl}!')
    except Exception as create_error:
        print(create_error)
        try:
            index = client.get_index(index_name=index_name)
        except Exception as lookup_error:
            # There is no existing index to fall back on, so the creation
            # failure is the real problem; surface it instead of the lookup error.
            raise RuntimeError(
                f"Could not create index {index_name} and no existing index "
                f"was found: {create_error}"
            ) from lookup_error
        index.sync()
        print(f'Syncing index for {tbl}!')

Now we can test a vector search query against the `flat` index.

In [0]:
%sql
-- Smoke test: hybrid search on the flat index, returning the top 5 matches.
SELECT *
  FROM vector_search(
         index       => :catalog || '.' || :schema || '.flat_index',
         query_text  => 'Contract No. 1885-16859 Amendment No. 1',
         query_type  => 'HYBRID',
         num_results => 5
       )