In [0]:
-- Destination table for contract classification results.
-- The table name is resolved at run time from the :catalog and :schema
-- parameter markers.
--
-- key_information and vector_search_results are declared here because the
-- MERGE statement later in this file inserts into them by name; without them
-- the INSERT column list cannot be resolved.
-- NOTE: CREATE TABLE IF NOT EXISTS does not alter a table that already exists.
-- If the table was created from an earlier version of this DDL, add the two
-- columns with ALTER TABLE ... ADD COLUMNS before running the MERGE.
CREATE TABLE IF NOT EXISTS IDENTIFIER(:catalog || '.' || :schema || '.classified') (
  -- Path of the master agreement document; used as the MERGE match key.
  path STRING,
  -- Key information of the master agreement, as sent to the model.
  key_information STRING,
  -- Preamble text of the master agreement.
  preamble STRING,
  -- Retained from the earlier schema for backward compatibility; not written by
  -- the MERGE in this file. Back-quoted because REFERENCES is a reserved word
  -- when ANSI reserved-keyword enforcement is enabled.
  `references` STRING,
  -- Retained from the earlier schema; not written by the MERGE in this file.
  metadata ARRAY<STRING>,
  -- Key information of every related document returned by vector search.
  -- Assumes the index column key_information is a STRING -- TODO confirm.
  vector_search_results ARRAY<STRING>,
  -- Parsed model response. The field names and nesting must stay in sync with
  -- the from_json schema string and the responseFormat JSON schema used by the
  -- MERGE statement.
  classification STRUCT<
    MasterAgreements: ARRAY<STRUCT<
      FileName: STRING,
      RelatedDocuments: STRUCT<
        Amendments: ARRAY<STRUCT<FileName: STRING, Date: STRING, Sequence: STRING>>,
        LatestAmendment: STRUCT<FileName: STRING, Date: STRING, Sequence: STRING>,
        RateSheets: ARRAY<STRING>,
        OtherRelatedFiles: ARRAY<STRING>
      >
    >>,
    UnclassifiedFiles: ARRAY<STRUCT<FileName: STRING, Title: STRING>>,
    Rationale: STRING,
    Confidence: DOUBLE
  >
);

-- Classify the documents related to each extracted master agreement and insert
-- one row per agreement path that is not yet present in the target table.
--
-- Pipeline:
--   pending     : extracted agreements whose path is not in the target yet, so
--                 vector search and the LLM are not invoked for rows that the
--                 insert-only MERGE would discard anyway.
--   ai_results  : for each pending agreement, up to 20 hybrid vector-search
--                 hits, aggregated into a single prompt for AI_QUERY.
--   final SELECT: parses the JSON response into the classification struct.
--
-- Existing target rows are never updated (there is no WHEN MATCHED clause).
MERGE INTO IDENTIFIER(:catalog || '.' || :schema || '.classified') AS target
USING (
  WITH pending AS (
    SELECT
      src.path,
      src.key_information,
      src.preamble
    -- NOTE(review): the source table is hard-coded while the target and the
    -- index follow :catalog / :schema -- confirm this is intentional.
    FROM shm.contracts.extracted AS src
    WHERE NOT EXISTS (
      SELECT 1
      FROM IDENTIFIER(:catalog || '.' || :schema || '.classified') AS existing
      WHERE existing.path = src.path
    )
  ),
  ai_results AS (
    SELECT
      e.path,
      e.key_information,
      e.preamble,
      array_agg(vs.key_information) AS vector_search_results,
      AI_QUERY(
          'databricks-claude-sonnet-4-5',
          CONCAT(
            'You are a contract document classification expert. \n\n',
            'Given a master agreement document (with its preamble and key information) and a set of related documents (vector search results), classify each related document as one of: Amendment, RateSheet, OtherRelatedFile, or Unclassified. \n\n',

            'Use the content of the preamble and key information from the master agreement, and the full text and key information from each related document for classification. Note that there are numerous files within the same folder that are more likely to be related to the contract than others. For amendments, determine their sequence (e.g., Amendment 1, Amendment 2) using the date found inside the document key information to establish chronological order. Highlight the latest amendment for each master agreement.\n\n',
            'Return ONLY valid JSON with this structure: {"MasterAgreements":[{"FileName":"...","RelatedDocuments":{"Amendments":[{"FileName":"...","Date":"...","Sequence":"..."}],"LatestAmendment":{"FileName":"...","Date":"...","Sequence":"..."},"RateSheets":["..."],"OtherRelatedFiles":["..."]}}],"UnclassifiedFiles":[{"FileName":"...","Title":"..."}],"Rationale":"...","Confidence":0.0} \n\n',

            'Use internal document content for classification, not file names. \n',
            'Ignore duplicates. \n',
            'Flag uncertain documents as UnclassifiedFiles with the full file name from the key information.\n',
            'Provide your rationale and confidence on a scale of 1 to 5, with 5 being perfectly confident.\n\n',
            'Master Agreement Path: ', e.path, '\n',
            -- CONCAT yields NULL when any argument is NULL; COALESCE keeps the
            -- prompt intact when these optional texts are missing.
            'Master Key Information: ', COALESCE(e.key_information, ''), '\n',
            'Master Preamble: ', COALESCE(e.preamble, ''), '\n',
            'Related Documents: ', array_join(array_agg(vs.key_information), '\n---\n'), '\n',
            'Other Vendor Files:', array_join(flatten(collect_list(vs.other_vendor_files)), ','), '\n',
            'Vendors: ', array_join(array_agg(vs.vendor_name), ', '), '\n'
            ),
        responseFormat => '{"type": "json_schema", "json_schema": {"name": "classification", "schema": {"type": "object", "properties": {"MasterAgreements": {"type": "array", "items": {"type": "object", "properties": {"FileName": {"type": "string"}, "RelatedDocuments": {"type": "object", "properties": {"Amendments": {"type": "array", "items": {"type": "object", "properties": {"FileName": {"type": "string"}, "Date": {"type": "string"}, "Sequence": {"type": "string"}}}}, "LatestAmendment": {"type": "object", "properties": {"FileName": {"type": "string"}, "Date": {"type": "string"}, "Sequence": {"type": "string"}}}, "RateSheets": {"type": "array", "items": {"type": "string"}}, "OtherRelatedFiles": {"type": "array", "items": {"type": "string"}}}}}}}, "UnclassifiedFiles": {"type": "array", "items": {"type": "object", "properties": {"FileName": {"type": "string"}, "Title": {"type": "string"}}}}, "Rationale": {"type": "string"}, "Confidence": {"type": "number"}}}, "strict": true}}'
      ) AS ai_response_json
    FROM pending AS e
    -- Exactly one lateral vector search per agreement. Joining the same search
    -- several times multiplies the rows per agreement and repeats every
    -- aggregated value. LEFT JOIN ... ON TRUE keeps agreements with no hits.
    LEFT JOIN LATERAL (
      SELECT
        key_information,
        other_vendor_files,
        vendor_name
      FROM vector_search(
        index => :catalog || '.' || :schema || '.index',
        query_text => e.key_information,
        query_type => 'HYBRID',
        num_results => 20
      )
    ) AS vs ON TRUE
    -- Every non-aggregated column selected above is listed here.
    GROUP BY e.path, e.key_information, e.preamble
  )
  SELECT
    path,
    key_information,
    preamble,
    vector_search_results,
    -- This schema string must match the classification column of the target
    -- table and the responseFormat JSON schema above.
    from_json(
      ai_response_json,
      'STRUCT<MasterAgreements:ARRAY<STRUCT<FileName:STRING, RelatedDocuments:STRUCT<Amendments:ARRAY<STRUCT<FileName:STRING, Date:STRING, Sequence:STRING>>, LatestAmendment:STRUCT<FileName:STRING, Date:STRING, Sequence:STRING>, RateSheets:ARRAY<STRING>, OtherRelatedFiles:ARRAY<STRING>>>>, UnclassifiedFiles:ARRAY<STRUCT<FileName:STRING, Title:STRING>>, Rationale:STRING, Confidence:DOUBLE>'
    ) AS classification
  FROM ai_results
) AS source
ON target.path = source.path
WHEN NOT MATCHED THEN
  INSERT (
    path,
    key_information,
    preamble,
    vector_search_results,
    classification
  )
  VALUES (
    source.path,
    source.key_information,
    source.preamble,
    source.vector_search_results,
    source.classification
  )