In [0]:
-- Target table for parsed contract documents; created only if it does not exist yet.
-- The MERGE statement in this file keys rows on `path`.
CREATE TABLE IF NOT EXISTS shm.contracts.parsed (
  -- File attributes selected from READ_FILES (binaryFile format)
  path STRING,
  modificationTime TIMESTAMP,
  length BIGINT,
  _metadata STRUCT<
    file_path: STRING,
    file_name: STRING,
    file_size: BIGINT,
    file_block_start: BIGINT,
    file_block_length: BIGINT,
    file_modification_time: TIMESTAMP
  >,
  -- Values extracted from `path` by the loading query
  vendor_name STRING,
  file_name STRING,
  -- Names of the other files found under the same vendor
  other_vendor_files ARRAY<STRING>,
  -- Output of AI_PARSE_DOCUMENT for the file content
  parsed VARIANT
);

-- Incrementally parse contract files from the raw volume into shm.contracts.parsed.
--
-- Files whose path already exists in the target are filtered out BEFORE
-- AI_PARSE_DOCUMENT is called, so skipping already-parsed documents does not
-- depend on where the planner places the MERGE join relative to the AI call.
-- The MERGE ... ON path condition is kept as a guard so that a path is only
-- ever inserted when the target has no row for it.
--
-- Parsing takes quite a while for a large number of documents (more pages = more time).
-- We aren't describing figures or illustrations to save time, nor saving images.
-- NOTE(review): matching is by path only, so a file replaced in place (same path,
-- newer modificationTime) is not parsed again -- confirm this is intended.
MERGE INTO shm.contracts.parsed AS target
USING (
  WITH new_files AS (
    -- Raw files (including binary content) that have no row in the target yet.
    SELECT
      path,
      modificationTime,
      length,
      _metadata,
      content,
      -- First path segment after 'raw/'; empty string when the pattern does not match.
      regexp_extract(path, 'raw/([^/]+)/', 1) AS vendor_name,
      -- Last path segment.
      regexp_extract(path, '/([^/]+)$', 1) AS file_name
    FROM READ_FILES('/Volumes/shm/contracts/raw/*', format => 'binaryFile', recursiveFileLookup => true) AS f
    WHERE NOT EXISTS (
      SELECT 1
      FROM shm.contracts.parsed AS already_parsed
      WHERE already_parsed.path = f.path
    )
  ),
  all_vendor_files AS (
    -- Name listing of EVERY file (parsed or not); deliberately unfiltered so that
    -- other_vendor_files also contains files that were parsed in earlier runs.
    SELECT
      regexp_extract(path, 'raw/([^/]+)/', 1) AS vendor_name,
      regexp_extract(path, '/([^/]+)$', 1) AS file_name
    FROM READ_FILES('/Volumes/shm/contracts/raw/*', format => 'binaryFile', recursiveFileLookup => true)
  )
  SELECT
    m.path,
    m.modificationTime,
    m.length,
    m._metadata,
    m.vendor_name,
    m.file_name,
    (
      -- All file names under the same vendor except this file's own name.
      SELECT collect_list(avf.file_name)
      FROM all_vendor_files avf
      WHERE avf.vendor_name = m.vendor_name
        AND avf.file_name != m.file_name
    ) AS other_vendor_files,
    AI_PARSE_DOCUMENT(m.content) AS parsed
  FROM new_files m
) AS source
ON target.path = source.path
WHEN NOT MATCHED THEN
  -- Explicit column mapping instead of INSERT * so the write does not depend on
  -- the source and target column sets lining up implicitly.
  INSERT (
    path,
    modificationTime,
    length,
    _metadata,
    vendor_name,
    file_name,
    other_vendor_files,
    parsed
  )
  VALUES (
    source.path,
    source.modificationTime,
    source.length,
    source._metadata,
    source.vendor_name,
    source.file_name,
    source.other_vendor_files,
    source.parsed
  );