%md
# Contract Analysis
## Metadata
This notebook takes a CSV file (dropped into the workspace) and creates dynamic SQL queries for extracting metadata from the output of the previous steps. The CSV is meant to make category management a lot easier - you only need `metadata_name`, `metadata_description`, and any ENUM categories (the optional `enum_fields` column), and it will construct the SQL queries for you.

In [0]:
import pandas as pd
# Load the field definitions from the CSV that sits next to this notebook.
# The metadata table gets a leading 'path' column followed by one column per
# metadata_name entry, in CSV order.
metadata_df = pd.read_csv('./metadata.csv')
column_names = ['path', *metadata_df['metadata_name'].tolist()]

In [0]:
# Notebook parameters: register every text widget with its default value first,
# then read the current values back into module-level variables.
for _widget_name, _widget_default in (
    ("catalog", ""),
    ("schema", ""),
    ("llm_endpoint", "databricks-claude-sonnet-4-5"),
    ("batch_size", "100"),
    ("max_input_char", "400000"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# String-valued parameters are used exactly as entered.
catalog, schema, llm_endpoint = (
    dbutils.widgets.get(key) for key in ("catalog", "schema", "llm_endpoint")
)
# Numeric parameters arrive as text; surrounding whitespace is stripped before int().
batch_size, max_input_char = (
    int(dbutils.widgets.get(key).strip()) for key in ("batch_size", "max_input_char")
)

In [0]:
# Make sure the target Delta table exists before anything is merged into it.
# Every column (path plus each metadata field) is a nullable string.
from pyspark.sql.types import StructType, StructField, StringType
table_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in column_names
])
df = spark.createDataFrame([], table_schema)
_metadata_fqn = f"{catalog}.{schema}.metadata"
# Only write the empty frame when the table is missing, so existing rows are kept.
if not spark.catalog.tableExists(_metadata_fqn):
    df.write.format("delta").mode("overwrite").saveAsTable(_metadata_fqn)

In [0]:
# Build prompt and response format programmatically from metadata_df.
# Each CSV row contributes one "- name: description[ ENUM: choices]" bullet to the
# prompt and one "name:STRING" member to the response struct, so the prompt and
# the schema the model's answer is parsed with always list the same fields.
fields = []
response_struct_fields = []
for _, row in metadata_df.iterrows():
    name = row['metadata_name']
    desc = row['metadata_description']
    # pandas reads an empty CSV cell as NaN; without this guard the prompt would
    # contain the literal text "nan" as the field description.
    if pd.isna(desc):
        desc = ""
    # The enum_fields column is optional; row.get covers a CSV that omits it.
    enum = row.get('enum_fields', None)
    # Skip missing, empty, and whitespace-only cells so no dangling "ENUM:" is emitted.
    has_enum = pd.notnull(enum) and bool(enum) and bool(str(enum).strip())
    enum_str = f" ENUM: {enum}" if has_enum else ""
    fields.append(f"- {name}: {desc}{enum_str}")
    response_struct_fields.append(f"{name}:STRING")

prompt = (
    "You are a contract expert with the task of extracting key information.\n "
    "Our corporation is Nova Chemicals. "
    "Extract the following fields in json format from the vendor contract below. Use the exact field names and types as specified. "
    "For ENUM fields, only use the provided choices. If you are not confident about a field, return \"\".\n\n"
    + "\n".join(fields) +
    "\n\nThink carefully about the extraction and review your work.\n"
    "For ENUM fields, only use the provided choices.\n"
    "If you are not confident about a field, return \"\".\n"
    "Return the output in json format.\n\n"
    "CONTRACT INFORMATION"
)

# json_struct is the schema from_json parses with; response_format wraps it in a
# single "result" member and is passed to AI_QUERY as responseFormat.
json_struct = "STRUCT<" + ",".join(response_struct_fields) + ">"
response_format = f"STRUCT<result:{json_struct}>"


In [0]:
# Compose the SQL query
#
# The statement picks up to batch_size paths from `flat` that are not yet in the
# metadata table (LEFT ANTI JOIN), builds one prompt per path, sends it to the
# LLM endpoint through AI_QUERY, parses the answer with json_struct, and MERGEs
# the parsed fields into the metadata table keyed on path.

def _sql_string_literal(value):
    """Return `value` rendered as a single-quoted SQL string literal.

    Backslashes and single quotes are backslash-escaped so that text coming from
    the CSV (e.g. a description containing an apostrophe) cannot terminate the
    literal early or be reinterpreted as an escape sequence.
    NOTE(review): this relies on Spark's default string-literal parsing
    (spark.sql.parser.escapedStringLiterals = false) - confirm for this workspace.
    """
    text = str(value)
    return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"


# Fully qualified MERGE target; the same <catalog>.<schema>.metadata table that
# the anti join below reads.
metadata_table_name = f"{catalog}.{schema}.metadata"

llm_endpoint_sql = _sql_string_literal(llm_endpoint)
prompt_sql = _sql_string_literal(prompt)
response_format_sql = _sql_string_literal(response_format)
json_struct_sql = _sql_string_literal(json_struct)

# CONCAT yields NULL as soon as one argument is NULL, and the LEFT JOINs can leave
# context columns NULL for a path. Those columns are therefore coalesced to ''.
# `truncated` (the document text) is deliberately left as-is: a document without
# text still produces a NULL prompt, exactly as before.
sql_query = f"""
MERGE INTO {metadata_table_name} AS target
USING (
  SELECT
    path,
    metadata.*,
    cast(to_json(metadata) as string) AS combined_metadata
  FROM (
    SELECT 
      path,
      from_json(AI_QUERY(
        {llm_endpoint_sql},
        SUBSTRING(CONCAT(
          {prompt_sql}, '\\n',
          'Vendor Name:', COALESCE(CAST(vendor_name AS STRING), ''), '\\n',
          'File Path:', COALESCE(CAST(path AS STRING), ''), '\\n',
          'Has Amendments:', COALESCE(CAST(has_amendments AS STRING), ''), '\\n',
          'Initial Master Agreement Expiry:', COALESCE(CAST(initial_master_agreement_expiry_date AS STRING), ''), '\\n',
          'Final Master Agreement Expiry:', COALESCE(CAST(final_expiry_date AS STRING), ''), '\\n',
          'Doc Info:\\n', COALESCE(CAST(combined_doc_info AS STRING), ''), '\\n',
          'Text:\\n', truncated, '\\n'
        ),0,{max_input_char}),
        responseFormat => {response_format_sql}
      ), {json_struct_sql}
      ) as metadata
    FROM (
      SELECT * EXCEPT(c.path, d.path)
      FROM {catalog}.{schema}.flat f
      LEFT JOIN {catalog}.{schema}.doc_info d
        ON f.path = d.path
      LEFT JOIN {catalog}.{schema}.classified c
        ON f.path = c.path
      LEFT ANTI JOIN {catalog}.{schema}.metadata m
        ON f.path = m.path
      -- WHERE c.is_master_agreement
      LIMIT CAST({batch_size} AS INT)
    )
  )
) AS source
ON target.path = source.path
WHEN NOT MATCHED THEN INSERT *;
"""

In [0]:
# Execute the query!
# Submits the MERGE statement held in sql_query to Spark. That statement calls
# AI_QUERY for at most batch_size paths that are not yet in the metadata table
# and inserts the extracted fields for them.
spark.sql(sql_query)

In [0]:
%sql
-- Show every row of <catalog>.<schema>.metadata. The table name is assembled
-- from the :catalog and :schema parameter markers (presumably bound to the
-- notebook widgets of the same names - confirm).
SELECT * FROM IDENTIFIER(:catalog || '.' || :schema || '.metadata')