In [0]:
-- CONTRACT ANALYSIS
-- -- References

-- Output table for the reference-extraction step. The name is resolved at run
-- time from the :catalog and :schema parameters; IF NOT EXISTS keeps the
-- statement safe to re-run.
CREATE TABLE IF NOT EXISTS IDENTIFIER(:catalog || '.' || :schema || '.references')
(
  -- Source document path; the MERGE in this file matches rows on it.
  path                STRING,
  -- Agreements (type + identifier) the model found in the document.
  agreements          ARRAY<STRING>,
  -- Referenced documents/attachments, each with the section that cites it.
  references          ARRAY<
                        STRUCT<
                          section:  STRING,
                          document: STRING
                        >
                      >,
  -- Flattened text rendering of agreements + references.
  combined_references STRING
)
USING DELTA;

-- Incrementally extract referenced agreements and documents from contract text
-- with AI_QUERY and append the results to <catalog>.<schema>.references.
--
-- Parameters:
--   :catalog, :schema  -- location of the `flat` input and `references` output tables
--   :batch_size        -- maximum number of not-yet-processed documents per run
--   :max_input_char    -- maximum number of characters of (prompt + document) sent to the model
--
-- Only paths not already present in `references` are processed, and rows are
-- only ever inserted (never updated), so the statement can be re-run to work
-- through the backlog one batch at a time.
MERGE INTO IDENTIFIER(:catalog || '.' || :schema || '.references') AS target
USING (
  WITH pending AS (
    -- Choose the batch BEFORE calling the model so that at most :batch_size
    -- documents are ever sent to AI_QUERY. Ordering by path makes the batch
    -- deterministic; NULLS LAST stops un-keyable NULL paths from crowding out
    -- real documents at the front of every batch.
    SELECT
      f.path,
      f.vendor_name,
      f.file_name,
      f.truncated
    FROM IDENTIFIER(:catalog || '.' || :schema || '.flat') AS f
    LEFT ANTI JOIN IDENTIFIER(:catalog || '.' || :schema || '.references') AS r
      ON f.path = r.path
    ORDER BY f.path NULLS LAST
    LIMIT CAST(:batch_size AS INT)
  ),
  ref_results AS (
    -- One model call per pending document; the JSON answer is parsed into a struct.
    SELECT
      p.path,
      from_json(
        AI_QUERY(
          'databricks-claude-sonnet-4-5',
          SUBSTRING(
            CONCAT(
              'You are a contract analysis expert. Your task is to identify all agreements referenced in the document and all referenced documents or attachments mentioned in any section, using both text and contextual clues from filename and folder structure.
            
            Step 1: Identify Referenced Agreements
            Scan the document for any specific agreements with identifiers or numbers (e.g., Master Agreement 1239-12900). Include the agreement type and its reference number. Common types include (but are not limited to):
            
            Master Agreement
            Framework Agreement
            Consulting Agreement (CSA)
            NDA / Confidentiality Agreement
            Purchase Order Terms and Conditions
            Mutually Agreed Terms and Conditions (MTC)
            Contract
            Master Work Agreement (MWA)
            Sales Contract
            Engineering Procurement Construction (EPC)
            Engineering Procurement Construction Management (EPCM)
            Construction Agreement
            Site Services Agreement
            Staffing Agreement
            Sales/Catering Contract
            Recruitment Agreement
            Administration Services Agreement
            Services Agreement
            License Agreement
            Supply Agreement
            Order Form
            Purchase Agreement
            General Terms and Conditions
            
            Important:
            
            Only include agreements that have a specific identifier or number.
            Ignore generic mentions like “the agreement” unless paired with a unique reference.
            Check for completeness: If multiple agreements are referenced, include all of them, not just the first one found.
            
            Step 2: Identify Referenced Documents
            For each section of the contract, list any real referenced documents or attachments that include a name and/or identifier (e.g., Economic Disclosure Statement: Ownership Interest Declaration (EDS-7: 3/2015)). Common references include:
            
            Amendments
            Rate Sheets
            Schedules
            Exhibits
            Addendums
            Statement of Work (SOW)
            Termination Notices
            Forms of Undertaking (FOU)
            Commitment Letters
            Change Orders
            
            Important:
            
            Do not include hypothetical or generic references.
            Only capture actual documents with identifiers or names.
            Check for completeness: If multiple referenced documents appear in different sections, include all of them.
            
            Filename and Folder Heuristics (Supportive Clues Only):
            
            If the filename contains tokens like SOW, RateSheet, Schedule, Exhibit, Addendum, Amendment, or ChangeOrder, treat it as a strong clue that the document is a referenced attachment.
            If the folder path groups documents together (e.g., a folder named MasterAgreement_1239 contains multiple files), assume that related reference documents (amendments, schedules, rate sheets) are likely in the same folder.
            These clues must not replace reading the document content, but they can help confirm or strengthen associations.
            
            Output Format:
            Return the results in JSON format:
            {
              "agreements": [
                "Master Agreement 1239-12900",
                "Consulting Agreement CSA-456"
              ],
              "references": [
                {"section": "Section 5 - Pricing", "document": "Rate Sheet RS-2024"},
                {"section": "Appendix A", "document": "Statement of Work SOW-789"}
              ]
            }
            
            Completeness Check in Example:
            
            Multiple agreements listed under "agreements".
            Multiple references from different sections under "references".
            Each reference includes both section name and document name/identifier.
            
            If no agreements or references are found, return empty arrays.
            ## Document ##',
              -- Separate the header from the first field (they used to run together).
              '\n',
              -- CONCAT yields NULL when any argument is NULL, so missing metadata
              -- must not be allowed to blank out the whole prompt.
              'Vendor Name:', COALESCE(p.vendor_name, ''), '\n',
              'File Name:', COALESCE(p.file_name, ''), '\n',
              -- Deliberately not coalesced: a document with NULL text still
              -- produces a NULL prompt, exactly as before.
              'Text:', p.truncated, '\n'
            ),
            1,
            CAST(:max_input_char AS INT)
          ),
          responseFormat => 'STRUCT<result:STRUCT<
            agreements:ARRAY<STRING>, 
            references:ARRAY<STRUCT<
                section:STRING, 
                document:STRING
            >>
        >>'
        ),
        -- NOTE(review): this schema omits the `result` wrapper named in
        -- responseFormat; presumably the endpoint returns the inner object --
        -- confirm against a sample response.
        'STRUCT<agreements:ARRAY<STRING>, references:ARRAY<STRUCT<section:STRING, document:STRING>>>'
      ) AS result
    FROM pending AS p
  )
  SELECT
    path,
    result.agreements AS agreements,
    result.references AS references,
    -- Plain-text rendering of both lists. CONCAT_WS skips NULL arguments, so a
    -- NULL agreements array simply leaves that line out.
    CONCAT_WS(
      '\n',
      'AGREEMENTS',
      array_join(result.agreements, ', '),
      '\n REFERENCES',
      CASE
        WHEN result.references IS NOT NULL AND size(result.references) > 0
          THEN array_join(
            transform(
              result.references,
              -- Keep a reference even when only one of section/document is set;
              -- NULLIF turns a fully empty entry into NULL so array_join drops it.
              ref -> NULLIF(CONCAT_WS(': ', ref.section, ref.document), '')
            ),
            ', '
          )
        ELSE ''
      END
    ) AS combined_references
  FROM ref_results
) AS source
ON target.path = source.path
WHEN NOT MATCHED THEN
  INSERT (path, agreements, references, combined_references)
  VALUES (source.path, source.agreements, source.references, source.combined_references);