# POC 2.0: AI_Query in Databricks
- `ai_query` calls the LLM of your choice
- This notebook extracts data from customer invoice PDF files into a structured format
- The structured data is pushed to a Delta table for analysis

## Convert PDF Invoices to Delta table in Structured JSON format using AI_Query

In [0]:
-- Extract one structured JSON invoice per PDF found in the Invoices volume.
--
-- Step 1 (parsed_invoices): read each file as binary and turn it into text
--   with ai_parse_document.
-- Step 2: send instructions + file path + document text to the LLM.
--
-- The file path and document text are passed under explicit "File path:" and
-- "Document:" labels so they are not fused onto the last rule of the prompt.
--
-- NOTE(review): one sample result saved in this notebook ends mid-value. If
-- responses are being cut off by the model's output limit, consider raising it
-- through ai_query's modelParameters argument -- TODO confirm the limit
-- supported by this endpoint.
WITH parsed_invoices AS (
  SELECT
    path,
    CAST(ai_parse_document(content) AS STRING) AS document_text
  FROM READ_FILES('/Volumes/pdev/shaurya/documents/Invoices/', format => 'binaryFile')
)
SELECT
  ai_query(
    "databricks-claude-sonnet-4",
    "You are an invoice parser. Output only a compact JSON object with keys:
     InvoiceNumber, InvoiceDate, CustomerID, DueDate, BillTo, ShipTo,
     LineItems:[{Product,Description,Qty,UnitPrice,ExtendedPrice,SalesTax,Total}],
     Totals:{Subtotal,Tax,TotalInvoice}.
     Rules:
       - Get Invoice Number, we can infer the invoice number from the File path given below,
         after dbfs:/Volumes/pdev/shaurya/documents/Invoices, it is between two underscores,
         Do NOT use 'Document details' (e.g., CON…).
       - For InvoiceDate, get it from Date: at the top, do not show any other text
       - Return only JSON, no commentary.
       - Extract all the line level information for eg: some invoice may have 120 line items
       - Do not display ```json at start of structured_invoice data and ``` at end"
    || "\n\nFile path: " || path
    || "\n\nDocument: " || document_text
  ) AS structured_invoice
FROM parsed_invoices


structured_invoice
"{  ""InvoiceNumber"": ""PRINV0276804"",  ""InvoiceDate"": null,  ""CustomerID"": ""ACA1001"",  ""DueDate"": ""7/1/2022"",  ""BillTo"": ""Acadia Healthcare Company, Inc.\n6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""ShipTo"": null,  ""LineItems"": [  {  ""Product"": ""UltiPro Benefits Prime"",  ""Description"": ""Launch"",  ""Qty"": 1.00,  ""UnitPrice"": 84464.00,  ""ExtendedPrice"": 84464.00,  ""SalesTax"": 8235.24,  ""Total"": 92699.24  },  {  ""Product"": ""UltiPro Benefits Prime"",  ""Description"": ""Sub Fees (21000 EEs x $1.65 x 3 Mos)"",  ""Qty"": 1.00,  ""UnitPrice"": 103950.00,  ""ExtendedPrice"": 103950.00,  ""SalesTax"": 10135.13,  ""Total"": 114085.13  }  ],  ""Totals"": {  ""Subtotal"": 188414.00,  ""Tax"": 18370.37,  ""TotalInvoice"": 206784.37  } }"
"{  ""InvoiceNumber"": ""PRINV0243953"",  ""InvoiceDate"": null,  ""CustomerID"": ""ACA1001"",  ""DueDate"": ""2/13/2022"",  ""BillTo"": ""Acadia Healthcare Company, Inc.\n6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""ShipTo"": null,  ""LineItems"": [  {  ""Product"": ""CLK011122-Resource Treatment Center"",  ""Description"": ""Sub Fees (1 Clock x $70.00 x 3 Mos)"",  ""Qty"": 1.00,  ""UnitPrice"": 210.00,  ""ExtendedPrice"": 210.00,  ""SalesTax"": 14.70,  ""Total"": 224.70  }  ],  ""Totals"": {  ""Subtotal"": 210.00,  ""Tax"": 14.70,  ""TotalInvoice"": 224.70  } }"
"{  ""InvoiceNumber"": ""PRINV0248620"",  ""InvoiceDate"": ""2/10/2022"",  ""CustomerID"": ""ACA1001"",  ""DueDate"": ""3/12/2022"",  ""BillTo"": ""Acadia Healthcare Company, Inc.\n6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""ShipTo"": ""6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""LineItems"": [  {  ""Product"": ""ACA Distribution Services"",  ""Description"": ""Minimum Subscription Fee on 20000 EEs for Mar-22 - on 20000 May-22"",  ""Qty"": 60000.00,  ""UnitPrice"": 0.52,  ""ExtendedPrice"": 31200.00,  ""SalesTax"": 0.00,  ""Total"": 31200.00  },  {  ""Product"": ""Core - 4th Tier"",  ""Description"": ""Incremental Subscription Fee for Dec-21"",  ""Qty"": 10000.00,  ""UnitPrice"": 2.92,  ""ExtendedPrice"": 29200.00,  ""SalesTax"": 2847.00,  ""Total"": 32047.00  },  {  ""Product"": ""Core - 5th Tier"",  ""Description"": ""Incremental Subscription Fee for Dec-21"",  ""Qty"": 1049.00,  ""UnitPrice"": 2.69,  ""ExtendedPrice"": 2821.81,  ""SalesTax"": 275.13,  ""Total"": 3096.94  },  {  ""Product"": ""ACA Distribution Svcs - Tier 2"",  ""Description"": ""Incremental Subscription Fee for Nov-21"",  ""Qty"": 16056.00,  ""UnitPrice"": 0.41,  ""ExtendedPrice"": 6582.96,  ""SalesTax"": 0.00,  ""Total"": 6582.96  },  {  ""Product"": ""ACA Distribution Svcs - Tier 2"",  ""Description"": ""Incremental Subscription Fee for Dec-21"",  ""Qty"": 16049.00,  ""UnitPrice"": 0.41,  ""ExtendedPrice"": 6580.09,  ""SalesTax"": 0.00,  ""Total"": 6580.09  },  {  ""Product"": ""ACA Distribution Svcs - Tier 2"",  ""Description"": ""Incremental Subscription Fee for Jan-22"",  ""Qty"": 616.00,  ""UnitPrice"": 0.41,  ""ExtendedPrice"": 252.56,  ""SalesTax"": 0.00,  ""Total"": 252.56  }  ],  ""Totals"": {  ""Subtotal"": 76637.42,  ""Tax"": 3122.13,  ""TotalInvoice"": 79759.55  } }"
"{  ""InvoiceNumber"": ""PRINV0261111"",  ""InvoiceDate"": null,  ""CustomerID"": ""ACA1001"",  ""DueDate"": ""4/30/2022"",  ""BillTo"": ""Acadia Healthcare Company, Inc.\n6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""ShipTo"": ""6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""LineItems"": [  {  ""Product"": ""03/12/22-Tammy Fleming SR-2022-00347837-UTA - Rhode Island Overtime Law"",  ""Description"": """",  ""Qty"": ""3.75"",  ""UnitPrice"": ""190.00"",  ""ExtendedPrice"": ""712.50"",  ""SalesTax"": ""0.00"",  ""Total"": ""712.50""  }  ],  ""Totals"": {  ""Subtotal"": ""712.50"",  ""Tax"": ""0.00"",  ""TotalInvoice"": ""712.50""  } }"
"{  ""InvoiceNumber"": ""ARINV0095114"",  ""InvoiceDate"": ""1/13/2022"",  ""CustomerID"": ""ACA1001"",  ""DueDate"": ""1/13/2022"",  ""BillTo"": ""Acadia Healthcare Company, Inc.\n6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""ShipTo"": ""6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""LineItems"": [  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0111_2021_2022 0106_192938.utf"", ""Qty"": 392.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 392.00, ""SalesTax"": 38.22, ""Total"": 430.22},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0104_2021_2022 0106_174134.utf"", ""Qty"": 280.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 280.00, ""SalesTax"": 27.30, ""Total"": 307.30},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0133_2021_2022 0106_153931.utf"", ""Qty"": 386.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 386.00, ""SalesTax"": 37.64, ""Total"": 423.64},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0107_2021_2022 0106_171636.utf"", ""Qty"": 396.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 396.00, ""SalesTax"": 38.61, ""Total"": 434.61},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0140_2021_2022 0106_170211.utf"", ""Qty"": 4.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 4.00, ""SalesTax"": 0.39, ""Total"": 4.39},  {""Product"": ""Year End Print Services"", ""Description"": ""List of files..."", ""Qty"": 2430.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 2430.00, ""SalesTax"": 236.93, ""Total"": 2666.93},  {""Product"": ""Year End Print Services"", ""Description"": ""List of files..."", ""Qty"": 3060.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 3060.00, ""SalesTax"": 298.35, ""Total"": 3358.35},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0182_2021_2022 
0106_143701.utf,ACA1001_24EE0_W2PRNT_0187_2021_20220106_171100.utf,ACA1001_24EE0_W2PRNT_0181_2021_20220106_175042.utf,ACA1001_24EE0_W2PRNT_0195_2021_20220106_143954.utf,ACA1001_24EE0_W2PRNT_0194_2021_20220106_180943.utf,ACA1001_24EE0_W2PRNT_0191_2021_20220106_133208.utf,ACA1001_24EE0_W2PRNT_0198_2021_20220106_200729.utf,ACA1001_24EE0_W2PRNT_0190_2021_20220106_192653.utf"", ""Qty"": 1643.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 1643.00, ""SalesTax"": 160.19, ""Total"": 1803.19},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0180_2021_2022 0106_184837.utf"", ""Qty"": 52.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 52.00, ""SalesTax"": 5.07, ""Total"": 57.07},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0157_2021_2022 0106_205305.utf"", ""Qty"": 86.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 86.00, ""SalesTax"": 8.39, ""Total"": 94.39},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0145_2021_2022 0106_170846.utf"", ""Qty"": 94.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 94.00, ""SalesTax"": 9.17, ""Total"": 103.17},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0504_2021_2022 0106_181504.utf,ACA1001_24EE0_W2PRNT_0403_2021_20220106_162345.utf,ACA1001_24EE0_W2PRNT_0601_2021_20220106_165959.utf,ACA1001_24EE0_W2PRNT_0820_2021_20220106_161755.utf,ACA1001_24EE0_W2PRNT_0603_2021_20220106_165056.utf,ACA1001_24EE0_W2PRNT_0701_2021_20220106_122259.utf,ACA1001_24EE0_W2PRNT_0703_2021_20220106_165737.utf,ACA1001_24EE0_W2PRNT_0503_2021_20220106_150916.utf,ACA1001_24EE0_W2PRNT_0400_2021_20220106_132320.utf,ACA1001_24EE0_W2PRNT_0501_2021_20220106_202437.utf"", ""Qty"": 2536.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 2536.00, ""SalesTax"": 247.26, ""Total"": 2783.26},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0136_2021_2022 0106_153228.utf"", ""Qty"": 49.00, ""UnitPrice"": 1.00, 
""ExtendedPrice"": 49.00, ""SalesTax"": 4.78, ""Total"": 53.78},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_3422_2021_2022 0106_143908_utf,ACA1001_24EE0_W2PRNT_3427_2021_2022106_191614_utf,ACA1001_24EE0_W2PRNT_3436_2021_20220106_155108_utf,ACA1001_24EE0_W2PRNT_3428_2021_20220106_204829_utf,ACA1001_24EE0_W2PRNT_3465_2021_20220106_171106_utf,ACA1001_24EE0_W2PRNT_3301_2021_20220106_205534_utf,ACA1001_24EE0_W2PRNT_3431_2021_20220106_133128_utf,ACA1001_24EE0_W2PRNT_3434_2021_20220106_174540_utf,ACA1001_24EE0_W2PRNT_3430_2021_20220106_133010_utf,ACA1001_24EE0_W2PRNT_3466_2021_20220106_164851_utf,ACA1001_24EE0_W2PRNT_3435_2021_20220106_140506_utf,ACA1001_24EE0_W2PRNT_3433_2021_20220106_155154_utf,ACA1001_24EE0_W2PRNT_3426_2021_20220106_180836_utf,ACA1001_24EE0_W2PRNT_3429_2021_20220106_211456_utf,ACA1001_24EE0_W2PRNT_3421_2021_20220106_131552_utf,ACA1001_24EE0_W2PRNT_3433_2021_20220106_144905_utf,ACA1001_24EE0_W2PRNT_3433_2021_20220106_123715_utf,ACA1001_24EE0_W2PRNT_3420_2021_"", ""Qty"": 1711.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 1711.00, ""SalesTax"": 166.82, ""Total"": 1877.82},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0126_2021_2022 0106_182923_utf"", ""Qty"": 189.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 189.00, ""SalesTax"": 18.43, ""Total"": 207.43},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0174_2021_2022 0106_184135_utf"", ""Qty"": 299.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 299.00, ""SalesTax"": 29.15, ""Total"": 328.15},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0170_2021_2022 0106_173707_utf"", ""Qty"": 224.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 224.00, ""SalesTax"": 21.84, ""Total"": 245.84},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0150_2021_2022 0106_180552_utf"", ""Qty"": 170.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 170.00, 
""SalesTax"": 16.58, ""Total"": 186.58},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0221_2021_2022 0106_143656_utf,ACA1001_24EE0_W2PRNT_0203_2021_20220106_162412_utf,ACA1001_24EE0_W2PRNT_0210_2021_20220106_123825_utf,ACA1001_24EE0_W2PRNT_0208_021_20220106_131043_utf,ACA1001_24EE0_W2PRNT_0205_2021_20220106_153325.utf,ACA1001_24EE0_W2PRNT_0250_2021_20220106_174609_utf,ACA1001_24EE0_W2PRNT_0251_2021_20220106_203433_utf,ACA1001_24EE0_W2PRNT_0207_2021_20220106_193837_utf"", ""Qty"": 3662.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 3662.00, ""SalesTax"": 357.05, ""Total"": 4019.05},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0302_2021_2022 0106_170218_utf,ACA1001_24EE0_W2PRNT_0355_2021_20220106_194315_utf,ACA1001_24EE0_W2PRNT_0308_2021_20220106_145603_utf,ACA1001_24EE0_W2PRNT_0306_2021_20220106_202909_utf,ACA1001_24EE0_W2PRNT_0300_2021_20220106_132325.utf,ACA1001_24EE0_W2PRNT_0303_2021_20220106_165559_utf,ACA1001_24EE0_W2PRNT_0310_2021_20220106_141154_utf,ACA1001_24EE0_W2PRNT_0301_2021_20220106_172328_utf"", ""Qty"": 2355.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 2355.00, ""SalesTax"": 229.61, ""Total"": 2584.61},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0161_2021_2022 0106_14354.utf"", ""Qty"": 268.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 268.00, ""SalesTax"": 26.13, ""Total"": 294.13},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0123_2021_2022 0106_195751.utf"", ""Qty"": 502.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 502.00, ""SalesTax"": 48.95, ""Total"": 550.95},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0130_2021_2022 0106_142150.utf"", ""Qty"": 294.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 294.00, ""SalesTax"": 28.67, ""Total"": 322.67},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0165_2021_2022 0106_131623.utf"", 
""Qty"": 412.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 412.00, ""SalesTax"": 40.17, ""Total"": 452.17},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0167_2021_2022 0106_161512.utf"", ""Qty"": 146.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 146.00, ""SalesTax"": 14.24, ""Total"": 160.24},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0178_2021_2022 0106_145827.utf"", ""Qty"": 259.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 259.00, ""SalesTax"": 25.25, ""Total"": 284.25},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0131_2021_2022 0106_160711.utf"", ""Qty"": 504.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 504.00, ""SalesTax"": 49.14, ""Total"": 553.14},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0146_2021_2022 0106_140745.utf"", ""Qty"": 239.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 239.00, ""SalesTax"": 23.30, ""Total"": 262.30},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0139_2021_2022 0106_154722.utf"", ""Qty"": 205.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 205.00, ""SalesTax"": 19.99, ""Total"": 224.99},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0143_2021_2022 0106_141558.utf"", ""Qty"": 340.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 340.00, ""SalesTax"": 33.15, ""Total"": 373.15},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0121_2021_2022 0106_175825.utf"", ""Qty"": 167.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 167.00, ""SalesTax"": 16.28, ""Total"": 183.28},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0160_2021_2022 0106_201627.utf"", ""Qty"": 566.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 566.00, ""SalesTax"": 55.19, ""Total"": 621.19},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0101_2021_2022 
0106_155634.utf"", ""Qty"": 497.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 497.00, ""SalesTax"": 48.46, ""Total"": 545.46},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0117_2021_2022 0106_173127.utf"", ""Qty"": 277.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 277.00, ""SalesTax"": 27.01, ""Total"": 304.01},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0127_2021_2022 0106_180148.utf"", ""Qty"": 264.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 264.00, ""SalesTax"": 25.74, ""Total"": 289.74},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0116_2021_2022 0106_183320.utf"", ""Qty"": 271.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 271.00, ""SalesTax"": 26.42, ""Total"": 297.42},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_CORP_2021_202 20106_133654.utf"", ""Qty"": 576.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 576.00, ""SalesTax"": 56.16, ""Total"": 632.16},  {""Product"": ""Year End Print Services"", ""Description"": ""ACA1001_24EE0_W2PRNT_0122_2021_2022 0106_150321.utf"", ""Qty"": 302.00, ""UnitPrice"": 1.00, ""ExtendedPrice"": 302.00, ""SalesTax"": 29.45, ""Total"": 331.45},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0111_2021_2022 0106_192938.utf"", ""Qty"": 1.00, ""UnitPrice"": 207.16, ""ExtendedPrice"": 207.16, ""SalesTax"": 20.20, ""Total"": 227.36},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0104_2021_2022 0106_174134.utf"", ""Qty"": 1.00, ""UnitPrice"": 148.40, ""ExtendedPrice"": 148.40, ""SalesTax"": 14.47, ""Total"": 162.87},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0133_2021_2022 0106_153931.utf"", ""Qty"": 1.00, ""UnitPrice"": 204.58, ""ExtendedPrice"": 204.58, ""SalesTax"": 19.95, ""Total"": 224.53},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0107_2021_2022 0106_171636.utf"", ""Qty"": 1.00, 
""UnitPrice"": 209.88, ""ExtendedPrice"": 209.88, ""SalesTax"": 20.46, ""Total"": 230.34},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0140_2021_2022 0106_170211.utf"", ""Qty"": 1.00, ""UnitPrice"": 2.12, ""ExtendedPrice"": 2.12, ""SalesTax"": 0.21, ""Total"": 2.33},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_3211_2021_2022 0106_162321_utf,ACA1001_24EE0_W2PRNT_3262_2021_20220106_164616_utf,ACA1001_24EE0_W2PRNT_3130_2021_20220106_123123_utf,ACA1001_24EE_W2PRNT_3150_2021_20220106_194054_utf,ACA1001_24EE0_W2PRNT_3202_2021_20220106_131225_utf,ACA1001_24EE0_W2PRNT_3272_2021_2022106_122617_utf,ACA1001_24EE0_W2PRNT_3228_2021_20220106_184113_utf,ACA1001_24EE0_W2PRNT_3298_2021_20220106_171034_utf,ACA1001_24EE0_W2PRNT_3138_2021_20220106_123414_utf,ACA1001_24EE0_W2PRNT_3283_2021_20220106_140121_utf,ACA1001_24EE0_W2PRNT_3296_2021_20220106_134744_utf,ACA1001_24EE0_W2PRNT_3253_2021_20220106_170826_utf,ACA1001_24EE0_W2PRNT_3274_2021_20220106_122504_utf,ACA1001_24EE0_W2PRNT_3270_2021_20220106_203311_utf,ACA1001_24EE0_W2PRNT_3216_2021_20220106_201604_utf,ACA1001_24EE0_W2PRNT_3208_2021_20220106_204912_utf,ACA1001_24EE0_W2PRNT_3201_2021_20220106_173043_utf,ACA1001_24EE0_W2PRNT_3200_2021_"", ""Qty"": 1.00, ""UnitPrice"": 1287.90, ""ExtendedPrice"": 1287.90, ""SalesTax"": 125.57, ""Total"": 1413.47},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_3500_2021_2022 
0106_185507_utf,ACA1001_24EE0_W2PRNT_3822_2021_20220106_130604_utf,ACA1001_24EE0_W2PRNT_3825_2021_20220106_135152_utf,ACA1001_24EE0_W2PRNT_3801_2021_20220106_155325_utf,ACA1001_24EE0_W2PRNT_3812_2021_20220106_122834.utf,ACA1001_24EE0_W2PRNT_3712_2021_20220106_195454_utf,ACA1001_24EE0_W2PRNT_3816_2021_20220106_122753_utf,ACA1001_24EE0_W2PRNT_3827_2021_20220106_140654_utf,ACA1001_24EE0_W2PRNT_3939_2021_20220106_195550_utf,ACA1001_24EE0_W2PRNT_3802_2021_20220106_130046_utf,ACA1001_24EE0_W2PRNT_3804_2021_20220106_144957_utf,ACA1001_24EE0_W2PRNT_3600_2021_20220106_134754_utf,ACA1001_24EE0_W2PRNT_3701_2021_20220106_212035_utf,ACA1001_24EE0_W2PRNT_3810_2021_20220106_211554_utf,ACA1001_24EE0_W2PRNT_3600_2021_20220106_185709_utf,ACA1001_24EE0_W2PRNT_3520_2021_20220106_152136_utf"", ""Qty"": 1.00, ""UnitPrice"": 1627.10, ""ExtendedPrice"": 1627.10, ""SalesTax"": 158.64, ""Total"": 1785.74},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0182_2021_2022 0106_143701.utf,ACA1001_24EE0_W2PRNT_0187_2021_20220106_171100.utf,ACA1001_24EE0_W2PRNT_0181_2021_20220106_175042.utf,ACA1001_24EE0_W2PRNT_0195_2021_20220106_143954.utf,ACA1001_24EE0_W2PRNT_0194_2021_20220106_180943.utf,ACA1001_24EE0_W2PRNT_0191_2021_20220106_133208.utf,ACA1001_24EE0_W2PRNT_0198_2021_20220106_200729.utf,ACA1001_24EE0_W2PRNT_0190_2021_20220106_192653.utf"", ""Qty"": 1.00, ""UnitPrice"": 870.79, ""ExtendedPrice"": 870.79, ""SalesTax"": 84.90, ""Total"": 955.69},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0180_2021_2022 0106_184837.utf"", ""Qty"": 1.00, ""UnitPrice"": 27.56, ""ExtendedPrice"": 27.56, ""SalesTax"": 2.69, ""Total"": 30.25},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0157_2021_2022 0106_205305.utf"", ""Qty"": 1.00, ""UnitPrice"": 45.58, ""ExtendedPrice"": 45.58, ""SalesTax"": 4.44, ""Total"": 50.02},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0145_2021_2022 0106_170846.utf"", ""Qty"": 1.00, 
""UnitPrice"": 49.82, ""ExtendedPrice"": 49.82, ""SalesTax"": 4.86, ""Total"": 54.68},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0504_2021_2022 0106_181504.utf,ACA1001_24EE0_W2PRNT_0403_2021_20220106_163245.utf,ACA1001_24EE0_W2PRNT_0601_2021_20220106_165959.utf,ACA1001_24EE0_W2PRNT_0820_2021_20220106_161755.utf,ACA1001_24EE0_W2PRNT_0603_2021_20220106_165056.utf,ACA1001_24EE0_W2PRNT_0701_2021_20220106_122259.utf,ACA1001_24EE0_W2PRNT_0703_2021_20220106_165737.utf,ACA1001_24EE0_W2PRNT_0503_2021_20220106_150916.utf,ACA1001_24EE0_W2PRNT_0400_2021_20220106_132320.utf,ACA1001_24EE0_W2PRNT_0501_2021_20220106_202437.utf"", ""Qty"": 1.00, ""UnitPrice"": 1344.08, ""ExtendedPrice"": 1344.08, ""SalesTax"": 131.05, ""Total"": 1475.13},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0136_2021_2022 0106_153228.utf"", ""Qty"": 1.00, ""UnitPrice"": 25.97, ""ExtendedPrice"": 25.97, ""SalesTax"": 2.53, ""Total"": 28.50},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_3422_2021_2022..."", ""Qty"": 1.00, ""UnitPrice"": 906.83, ""ExtendedPrice"": 906.83, ""SalesTax"": 88.42, ""Total"": 995.25},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W2PRNT_0126_2021_2022..."", ""Qty"": 1.00, ""UnitPrice"": 100.17, ""ExtendedPrice"": 100.17, ""SalesTax"": 9.77, ""Total"": 109.94},  {""Product"": ""Shipping"", ""Description"": ""ACA1001_24EE0_W"
"{  ""InvoiceNumber"": ""ARINV0100598"",  ""InvoiceDate"": ""1/26/2022"",  ""CustomerID"": ""ACA1001"",  ""DueDate"": ""2/25/2022"",  ""BillTo"": ""Acadia Healthcare Company, Inc.\n6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""ShipTo"": ""201 Wellness Way Reading, PA 19605-8902 USA"",  ""LineItems"": [  {  ""Product"": ""0700-0521-1"",  ""Description"": ""UTB0521 Annual Maintenance (Purchase) Effective Date: 02/01/2022 Expiration Date: 01/31/2023 Serial Number: BR5R183260206 Clock Name: 0205-999-P-CLOCK1"",  ""Qty"": 1.00,  ""UnitPrice"": 340.00,  ""ExtendedPrice"": 340.00,  ""SalesTax"": 33.15,  ""Total"": 373.15  },  {  ""Product"": ""0700-0521-1"",  ""Description"": ""UTB0521 Annual Maintenance (Purchase) Effective Date: 02/01/2022 Expiration Date: 01/31/2023 Serial Number: BR5R183260435 Clock Name: 0205-999-P-CLOCK2"",  ""Qty"": 1.00,  ""UnitPrice"": 340.00,  ""ExtendedPrice"": 340.00,  ""SalesTax"": 33.15,  ""Total"": 373.15  },  {  ""Product"": ""0700-0521-1"",  ""Description"": ""UTB0521 Annual Maintenance (Purchase) Effective Date: 02/01/2022 Expiration Date: 01/31/2023 Serial Number: A908174860212 Clock Name: 3600-999-P-HR"",  ""Qty"": 1.00,  ""UnitPrice"": 340.00,  ""ExtendedPrice"": 340.00,  ""SalesTax"": 33.15,  ""Total"": 373.15  },  {  ""Product"": ""0700-0521-1"",  ""Description"": ""UTB0521 Annual Maintenance (Purchase) Effective Date: 02/01/2022 Expiration Date: 01/31/2023 Serial Number: BR5R192260256 Clock Name: 0121-999-P-ADMIN FACILITY"",  ""Qty"": 1.00,  ""UnitPrice"": 340.00,  ""ExtendedPrice"": 340.00,  ""SalesTax"": 33.15,  ""Total"": 373.15  },  {  ""Product"": ""0700-0521-1"",  ""Description"": ""UTB0521 Annual Maintenance (Purchase) Effective Date: 02/01/2022 Expiration Date: 01/31/2023 Serial Number: BR5R192260395 Clock Name: 0121-999-P-MAIN FACILITY"",  ""Qty"": 1.00,  ""UnitPrice"": 340.00,  ""ExtendedPrice"": 340.00,  ""SalesTax"": 33.15,  ""Total"": 373.15  },  {  ""Product"": 
""0700-0521-1"",  ""Description"": ""UTB0521 Annual Maintenance (Purchase) Effective Date: 02/01/2022 Expiration Date: 01/31/2023 Serial Number: 6594173300112 Clock Name: 0198-999-N-READING MOB"",  ""Qty"": 1.00,  ""UnitPrice"": 340.00,  ""ExtendedPrice"": 340.00,  ""SalesTax"": 20.40,  ""Total"": 360.40  },  {  ""Product"": ""0700-0521-1"",  ""Description"": ""UTB0521 Annual Maintenance (Purchase) Effective Date: 02/01/2022 Expiration Date: 01/31/2023 Serial Number: BR5R201560368 Clock Name: 0198-999-P-Tower_2"",  ""Qty"": 1.00,  ""UnitPrice"": 340.00,  ""ExtendedPrice"": 340.00,  ""SalesTax"": 20.40,  ""Total"": 360.40  }  ],  ""Totals"": {  ""Subtotal"": 2380.00,  ""Tax"": 206.55,  ""TotalInvoice"": 2586.55  } }"
"{  ""InvoiceNumber"": ""ARINV0176838"",  ""InvoiceDate"": ""September 2022"",  ""CustomerID"": ""AAK1000"",  ""DueDate"": ""11/16/2022"",  ""BillTo"": ""AAK USA Inc.\n499 Thornall St\nATTN: Megan Griffault\nEdison, NJ 08837-2210\nUSA"",  ""ShipTo"": null,  ""LineItems"": [  {  ""Product"": ""Checkprint-DDA"",  ""Description"": ""AAK1000_AAKUS_ULTIDDA_AAKK1_20220961_20220912_161416.txt COID-AAK1"",  ""Qty"": 6.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 3.00,  ""SalesTax"": 0.20,  ""Total"": 3.20  },  {  ""Product"": ""Checkprint-DDA"",  ""Description"": ""AAK1000_AAKUS_ULTIDDA_AAKUS_20220961_20220913_155556.txt COID-AAKUS"",  ""Qty"": 9.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 4.50,  ""SalesTax"": 0.30,  ""Total"": 4.80  },  {  ""Product"": ""Checkprint-DDA"",  ""Description"": ""AAK1000_AAKUS_ULTIDDA_AAKFS_20220961_20220914_102129.txt COID-AAKFS"",  ""Qty"": 105.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 52.50,  ""SalesTax"": 3.48,  ""Total"": 55.98  },  {  ""Product"": ""Checkprint-DDA"",  ""Description"": ""AAK1000_AAKUS_ULTIDDA_AAKRIC_202209161_20220914_151427.txt COID-AAKRIC"",  ""Qty"": 26.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 13.00,  ""SalesTax"": 0.86,  ""Total"": 13.86  },  {  ""Product"": ""Checkprint-DDA"",  ""Description"": ""AAK1000_AAKUS_ULTIDDA_AAK2_20220901_20220926_145421.txt COID-AAK2"",  ""Qty"": 1.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 0.50,  ""SalesTax"": 0.03,  ""Total"": 0.53  },  {  ""Product"": ""Checkprint-DDA"",  ""Description"": ""AAK1000_AAKUS_ULTIDDA_AAK1_20220901_20220927_125513.txt COID-AAK1"",  ""Qty"": 4.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 2.00,  ""SalesTax"": 0.13,  ""Total"": 2.13  },  {  ""Product"": ""Checkprint-DDA"",  ""Description"": ""AAK1000_AAKUS_ULTIDDA_AAKRIC_202209301_20220928_135715.txt COID-AAKRIC"",  ""Qty"": 29.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 14.50,  ""SalesTax"": 0.96,  ""Total"": 15.46  },  {  ""Product"": ""Checkprint-DDA"",  ""Description"": 
""AAK1000_AAKUS_ULTIDDA_AAKUS_20220901_20220928_100742.txt COID-AAKUS"",  ""Qty"": 10.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 5.00,  ""SalesTax"": 0.33,  ""Total"": 5.33  },  {  ""Product"": ""Checkprint-DDA"",  ""Description"": ""AAK1000_AAKUS_ULTIDDA_AAKFS_20220901_20220928_110457.txt COID-AAKFS"",  ""Qty"": 108.00,  ""UnitPrice"": 0.50,  ""ExtendedPrice"": 54.00,  ""SalesTax"": 3.58,  ""Total"": 57.58  }  ],  ""Totals"": {  ""Subtotal"": 149.00,  ""Tax"": 9.87,  ""TotalInvoice"": 158.87  } }"
"{  ""InvoiceNumber"": ""ARINV0104045"",  ""InvoiceDate"": null,  ""CustomerID"": ""ACA1001"",  ""DueDate"": ""3/4/2022"",  ""BillTo"": ""Acadia Healthcare Company, Inc.\n6100 Tower Circle Suite 1000\nATTN: Accounts Payable\nFranklin, TN 37067-1509\nUSA"",  ""ShipTo"": null,  ""LineItems"": [  {  ""Product"": ""Prepaid Maintenance All- PPM v3.0 All-In Jan-22 - Dec-22"",  ""Description"": """",  ""Qty"": ""1.00"",  ""UnitPrice"": ""14,165.73"",  ""ExtendedPrice"": ""14,165.73"",  ""SalesTax"": ""0.00"",  ""Total"": ""14,165.73""  }  ],  ""Totals"": {  ""Subtotal"": ""14,165.73"",  ""Tax"": ""0.00"",  ""TotalInvoice"": ""14,165.73""  } }"
"{  ""InvoiceNumber"": ""LIM1001"",  ""InvoiceDate"": ""17-SEP-2025"",  ""CustomerID"": ""LIM1001"",  ""DueDate"": ""17-OCT-2025"",  ""BillTo"": ""Limbach Facility Services, 797 Commonwealth Dr, Warrendale, PA 15086-7520, USA"",  ""ShipTo"": ""Limbach Facility Services, 797 Commonwealth Dr, Warrendale, PA 15086-7520, USA"",  ""LineItems"": [  {  ""Product"": ""UKG PRO COMMUNICATION BROADCAST-TEXT NOTIFICATIONS - Fixed"",  ""Description"": ""Contracted Minimum Fee"",  ""Qty"": 1.00,  ""UnitPrice"": 1200.00,  ""ExtendedPrice"": 1200.00,  ""SalesTax"": ""Yes"",  ""Total"": 1200.00  },  {  ""Product"": ""UKG PRO TALK - Active Employees"",  ""Description"": ""Contracted Minimum Fee"",  ""Qty"": 1450.00,  ""UnitPrice"": 1.50,  ""ExtendedPrice"": 2175.00,  ""SalesTax"": ""Yes"",  ""Total"": 2175.00  },  {  ""Product"": ""UKG PRO COMPENSATION - Active Employees"",  ""Description"": ""Contracted Minimum Fee"",  ""Qty"": 1450.00,  ""UnitPrice"": 1.59,  ""ExtendedPrice"": 2305.50,  ""SalesTax"": ""Yes"",  ""Total"": 2305.50  },  {  ""Product"": ""UKG PRO SUCCESSION - Active Employees"",  ""Description"": ""Contracted Minimum Fee"",  ""Qty"": 1450.00,  ""UnitPrice"": 1.59,  ""ExtendedPrice"": 2305.50,  ""SalesTax"": ""Yes"",  ""Total"": 2305.50  },  {  ""Product"": ""UKG PRO LIMITED ACCESS - Limited Access Employees"",  ""Description"": ""Contracted Minimum Fee"",  ""Qty"": 1.00,  ""UnitPrice"": 3.15,  ""ExtendedPrice"": 3.15,  ""SalesTax"": ""Yes"",  ""Total"": 3.15  },  {  ""Product"": ""UKG PRO EMPLOYEE VOICE - Active Employees"",  ""Description"": ""Contracted Minimum Fee"",  ""Qty"": 1450.00,  ""UnitPrice"": 1.59,  ""ExtendedPrice"": 2305.50,  ""SalesTax"": ""Yes"",  ""Total"": 2305.50  },  {  ""Product"": ""UKG PRO ONBOARDING/UKG PRO PERFORMANCE REVIEWS/UKG PRO RECRUITING/UKG PRO COACHING AND DEVELOPMENT/UKG PRO BASIC SSO/UKG PRO ACA SERVICES/UKG PRO HCM NPRD/UKG PRO HRSD PEOPLE ASSIST/UKG PRO HRSD BENEFITS ADMINISTRATION/UKG PRO PEOPLE CENTER/UKG PRO MODEL MY PAY/UKG 
PRO DATA EXCHANGE SERVICES - Active Employees"",  ""Description"": ""Contracted Minimum Fee"",  ""Qty"": 1450.00,  ""UnitPrice"": 38.10,  ""ExtendedPrice"": 55244.99,  ""SalesTax"": ""Yes"",  ""Total"": 55244.99  }  ],  ""Totals"": {  ""Subtotal"": 65539.64,  ""Tax"": 3868.61,  ""TotalInvoice"": 69408.25  } }"
"{  ""InvoiceNumber"": ""RES1015"",  ""InvoiceDate"": ""17-SEP-2025"",  ""CustomerID"": ""RES1015"",  ""DueDate"": ""17-OCT-2025"",  ""BillTo"": ""Resa Power, LLC, 8723 Fallbrook Dr, Houston, TX 77064-3318, USA"",  ""ShipTo"": ""Resa Power, LLC, 8723 Fallbrook Dr, Houston, TX 77064-3318, USA"",  ""LineItems"": [  {  ""Product"": ""UKG PRO COACHING AND DEVELOPMENT - Compensated Employees"",  ""Description"": ""Contracted Minimum Fee 01-Oct-25 - 31-Dec-25"",  ""Qty"": 1035.00,  ""UnitPrice"": 6.30,  ""ExtendedPrice"": 6520.50,  ""SalesTax"": 538.44,  ""Total"": 6520.50  },  {  ""Product"": ""UKG PRO CANADIAN PAY AND PEOPLE CENTER/UKG PRO PAY AND PEOPLE CENTER/UKG PRO TIME CLASSIC/UKG PRO PERFORMANCE REVIEWS/UKG PRO ONBOARDING/UKG PRO RECRUITING - Compensated Employees"",  ""Description"": ""Contracted Minimum Fee 01-Oct-25 - 31-Dec-25"",  ""Qty"": 1035.00,  ""UnitPrice"": 96.00,  ""ExtendedPrice"": 99359.99,  ""SalesTax"": 8197.20,  ""Total"": 99359.99  },  {  ""Product"": ""UKG PRO COMPENSATION - Compensated Employees"",  ""Description"": ""Contracted Minimum Fee 01-Oct-25 - 31-Dec-25"",  ""Qty"": 1035.00,  ""UnitPrice"": 9.30,  ""ExtendedPrice"": 9625.50,  ""SalesTax"": 794.10,  ""Total"": 9625.50  },  {  ""Product"": ""UKG PRO SUCCESSION - Compensated Employees"",  ""Description"": ""Contracted Minimum Fee 01-Oct-25 - 31-Dec-25"",  ""Qty"": 1035.00,  ""UnitPrice"": 3.75,  ""ExtendedPrice"": 3881.25,  ""SalesTax"": 320.20,  ""Total"": 3881.25  },  {  ""Product"": ""UKG HRSD DOCUMENT MANAGER - Compensated Employees"",  ""Description"": ""Contracted Minimum Fee 01-Oct-25 - 31-Dec-25"",  ""Qty"": 1035.00,  ""UnitPrice"": 9.39,  ""ExtendedPrice"": 9718.65,  ""SalesTax"": 801.79,  ""Total"": 9718.65  },  {  ""Product"": ""UKG PRO LEARNING - Compensated Employees"",  ""Description"": ""Contracted Minimum Fee 01-Oct-25 - 31-Dec-25"",  ""Qty"": 1035.00,  ""UnitPrice"": 12.45,  ""ExtendedPrice"": 12885.75,  ""SalesTax"": 1063.07,  ""Total"": 12885.75  },  {  ""Product"": 
""UKG PRO EMPLOYEE VOICE - Compensated Employees"",  ""Description"": ""Contracted Minimum Fee 01-Oct-25 - 31-Dec-25"",  ""Qty"": 1035.00,  ""UnitPrice"": 6.30,  ""ExtendedPrice"": 6520.50,  ""SalesTax"": 538.44,  ""Total"": 6520.50  }  ],  ""Totals"": {  ""Subtotal"": 148512.14,  ""Tax"": 9801.78,  ""TotalInvoice"": 158313.92  } }"


In [0]:
-- Preview of the Bronze extraction: for every PDF in the Invoices volume,
-- return the file path, an LLM-produced JSON rendering of the invoice and an
-- LLM-produced plain-text summary.
WITH parsed_invoices AS (
  -- Parse each PDF once here so both prompts below reuse the same text
  -- instead of calling ai_parse_document twice per file.
  SELECT
    path,
    CAST(ai_parse_document(content) AS STRING) AS parsed_document
  FROM READ_FILES('/Volumes/pdev/shaurya/documents/Invoices/', format => 'binaryFile')
)
SELECT

-- 1
  path,


-- 2
  -- JSON structured invoice
  -- The file path and the parsed document are appended under explicit labels
  -- so the model can tell where the instructions end and the inputs begin.
  ai_query(
    "databricks-claude-sonnet-4",
    "You are an invoice parser. Output only a compact JSON object with keys:
     InvoiceNumber, InvoiceDate, CustomerID, DueDate, BillTo, ShipTo,
     LineItems:[{Product,Description,Qty,UnitPrice,ExtendedPrice,SalesTax,Total}],
     Totals:{Subtotal,Tax,TotalInvoice}.
     Rules:
       - Get Invoice Number, we can infer the invoice number from the File path given at the end of this prompt, after dbfs:/Volumes/pdev/shaurya/documents/Invoices, it is between two underscores,
         Do NOT use 'Document details' (e.g., CON…).
       - For InvoiceDate, get it from Date: at the top, do not show any other text
       - Return only JSON, no commentary.
       - Extract all the line level information for eg: some invoice may have 120 line items
       - Do not display ```json at start of structured_invoice data and ``` at end
     File path: " || path || "
     Parsed document: " || parsed_document
  ) AS structured_invoice,


-- 3
  -- Freeform text summary
  ai_query(
    "databricks-meta-llama-3-3-70b-instruct",
    "Summarize this invoice document in plain text for a human reader. Document: "
      || parsed_document
  ) AS raw_text
FROM parsed_invoices


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

## 01. Bronze Layer
#### Store unstructured PDF extracts in structured format

In [0]:
-- Bronze table: one row per invoice PDF read from the Invoices volume.
-- Created only if missing, so re-running the cell keeps the existing table.
CREATE TABLE IF NOT EXISTS pdev.shaurya.Raw_Invoices_PDF_Extract (
    -- File path of the source PDF as returned by READ_FILES
    path STRING,
    -- JSON text produced by the invoice-parser prompt (stored unparsed)
    structured_invoice STRING,
    -- Plain-text summary produced by the summarization prompt
    raw_text STRING
)
USING DELTA
-- Delta auto-optimize settings for writes and compaction
TBLPROPERTIES (
    delta.autoOptimize.optimizeWrite = true,
    delta.autoOptimize.autoCompact = true
);


-- Insert into Raw table
-- Replaces the full contents of the Bronze table with one row per PDF found in
-- the Invoices volume. The target uses the same three-part name as the
-- CREATE TABLE statement so both always address the same table.
INSERT OVERWRITE pdev.shaurya.Raw_Invoices_PDF_Extract

WITH parsed_invoices AS (
  -- Parse each PDF once here so both prompts below reuse the same text
  -- instead of calling ai_parse_document twice per file.
  SELECT
    path,
    CAST(ai_parse_document(content) AS STRING) AS parsed_document
  FROM READ_FILES('/Volumes/pdev/shaurya/documents/Invoices/', format => 'binaryFile')
)
SELECT
  path,
  -- JSON structured invoice data from Claude model
  -- LineItems must include Description: the Silver layer's from_json schema
  -- reads it, and a key missing from the JSON becomes a NULL column there.
  -- The file path and the parsed document are appended under explicit labels
  -- so the model can tell where the instructions end and the inputs begin.
  ai_query(
    "databricks-claude-sonnet-4",
    "You are an invoice parser. Output only a compact JSON object with keys:
     InvoiceNumber, InvoiceDate, CustomerID, DueDate, BillTo, ShipTo,
     LineItems:[{Product,Description,Qty,UnitPrice,ExtendedPrice,SalesTax,Total}],
     Totals:{Subtotal,Tax,TotalInvoice}.
     Rules:
       - Get Invoice Number, we can infer the invoice number from the File path given at the end of this prompt, after dbfs:/Volumes/pdev/shaurya/documents/Invoices, it is between two underscores,
         Do NOT use 'Document details' (e.g., CON…).
       - For InvoiceDate, get it from Date: at the top, do not show any other text
       - Return only JSON, no commentary.
       - Extract all the line level information for eg: some invoice may have 120 line items
       - Do not display ```json at start of structured_invoice data and ``` at end
     File path: " || path || "
     Parsed document: " || parsed_document
  ) AS structured_invoice,

  -- Freeform text summary from LLAMA model
  ai_query(
    "databricks-meta-llama-3-3-70b-instruct",
    "Summarize this invoice document in plain text for a human reader. Document: "
      || parsed_document
  ) AS raw_text
FROM parsed_invoices


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
-- Ad-hoc inspection of the Bronze table
SELECT *
FROM pdev.shaurya.Raw_Invoices_PDF_Extract

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

%md
## 02. Silver Layer for PDF Extracts
#### Flatten and enrich data from Raw and EDW tables

In [0]:
-- Silver table: one row per extracted invoice line item, enriched with EDW
-- attributes. Columns prefixed Extracted_ are loaded from the parsed PDF JSON;
-- columns prefixed EDW_ are loaded from the edw.* tables.
-- CREATE OR REPLACE drops any existing data each time this cell runs.
CREATE OR REPLACE TABLE shaurya.Silver_Invoices_PDF_Extract (
    -- EDW Cus_AR_Num when the invoice matches EDW, otherwise the PDF CustomerID
    Extracted_customer_id         STRING,
    EDW_Customer_Name             STRING,
    EDW_product_name              STRING,
    EDW_covered_product_name      STRING,
    Extracted_invoice_num         STRING,
    -- EDW calendar date when matched, otherwise the raw date text from the PDF
    Extracted_invoice_date        STRING,
    Extracted_due_date            STRING,
    Extracted_bill_to             STRING,
    Extracted_ship_to             STRING,
    -- Invoice-level totals
    Extracted_Invoice_subtotal    DOUBLE,
    EDW_Invoice_tax               DOUBLE,
    Extracted_Invoice_tax         DOUBLE,
    EDW_Invoice_total             DOUBLE,
    Extracted_Invoice_total       DOUBLE,
    -- Line-level values from the LineItems array of the parsed JSON
    Extracted_line_product        STRING,
    Extracted_line_description    STRING,
    Extracted_line_qty            DOUBLE,
    Extracted_line_unit_price     DOUBLE,
    Extracted_line_extended_price DOUBLE,
    Extracted_line_sales_tax      DOUBLE,
    Extracted_line_total          DOUBLE
)
USING DELTA;


-- Rebuilds the Silver invoice table: parses the Bronze JSON, flattens it to
-- one row per line item, then enriches each row with EDW customer, product
-- and invoice attributes. Columns are matched to the target by position.
INSERT OVERWRITE shaurya.Silver_Invoices_PDF_Extract

WITH pdf_parsed AS (
    -- Turn the LLM's JSON text into a typed struct.
    -- NOTE(review): values that do not fit the declared type (a sample in the
    -- notebook output has SalesTax = "Yes") will not load as numbers - confirm
    -- how such rows should be handled.
    SELECT
        from_json(
            structured_invoice,
            'struct<
                InvoiceNumber:string,
                InvoiceDate:string,
                CustomerID:string,
                DueDate:string,
                BillTo:string,
                ShipTo:string,
                LineItems:array<struct<
                    Product:string,
                    Description:string,
                    Qty:double,
                    UnitPrice:double,
                    ExtendedPrice:double,
                    SalesTax:double,
                    Total:double
                >>,
                Totals:struct<
                    Subtotal:double,
                    Tax:double,
                    TotalInvoice:double
                >
            >'
        ) AS parsed
    FROM shaurya.Raw_Invoices_PDF_Extract
),
pdf_lines AS (
    -- One row per invoice line item. DISTINCT collapses identical rows, which
    -- also merges genuinely repeated identical line items on one invoice.
    -- explode() drops invoices whose LineItems array is NULL or empty.
    SELECT DISTINCT
        parsed.InvoiceNumber       AS invoice_num,
        parsed.InvoiceDate         AS invoice_date,
        parsed.CustomerID          AS customer_id,
        parsed.DueDate             AS due_date,
        parsed.BillTo              AS bill_to,
        parsed.ShipTo              AS ship_to,
        parsed.Totals.Subtotal     AS Invoice_subtotal,
        parsed.Totals.Tax          AS Invoice_tax,
        parsed.Totals.TotalInvoice AS Invoice_total,
        li.Product                 AS line_product,
        li.Description             AS line_description,
        li.Qty                     AS line_qty,
        li.UnitPrice               AS line_unit_price,
        li.ExtendedPrice           AS line_extended_price,
        li.SalesTax                AS line_sales_tax,
        li.Total                   AS line_total
    FROM pdf_parsed
    LATERAL VIEW explode(parsed.LineItems) li_tbl AS li
)

-- GROUP BY ALL groups on every non-aggregated column below, so the result is
-- already free of duplicate rows and needs no DISTINCT. No ORDER BY is used:
-- sorting the rows of an INSERT only adds a sort step to the write.
-- NOTE(review): the fact table is joined on invoice number only, so every PDF
-- line pairs with every EDW line of that invoice; EDW_Invoice_tax is then the
-- tax summed per output group, not necessarily per invoice - confirm intent.
SELECT
    coalesce(cad.Cus_AR_Num, pdf.customer_id)                         AS Extracted_customer_id,
    cad.Cus_Acct_Name                                                 AS EDW_Customer_Name,
    pd.Prod_Name                                                      AS EDW_product_name,
    svcpd.Prod_Name                                                   AS EDW_covered_product_name,
    pdf.invoice_num                                                   AS Extracted_invoice_num,
    -- Explicit cast so both coalesce arguments are strings, matching the
    -- STRING target column
    coalesce(CAST(inv_dt.Calendar_Date AS STRING), pdf.invoice_date)  AS Extracted_invoice_date,
    pdf.due_date                                                      AS Extracted_due_date,
    pdf.bill_to                                                       AS Extracted_bill_to,
    pdf.ship_to                                                       AS Extracted_ship_to,
    pdf.Invoice_subtotal                                              AS Extracted_Invoice_subtotal,
    sum(f.Invoice_Ln_Tax_Amt_TC)                                      AS EDW_Invoice_tax,
    pdf.Invoice_tax                                                   AS Extracted_Invoice_tax,
    f.Total_Invoice_Amt_TC                                            AS EDW_Invoice_total,
    pdf.Invoice_total                                                 AS Extracted_Invoice_total,
    pdf.line_product                                                  AS Extracted_line_product,
    pdf.line_description                                              AS Extracted_line_description,
    pdf.line_qty                                                      AS Extracted_line_qty,
    pdf.line_unit_price                                               AS Extracted_line_unit_price,
    pdf.line_extended_price                                           AS Extracted_line_extended_price,
    pdf.line_sales_tax                                                AS Extracted_line_sales_tax,
    pdf.line_total                                                    AS Extracted_line_total
FROM pdf_lines AS pdf
LEFT JOIN edw.customer_invoice_line_fact AS f
       ON f.invoice_num = pdf.invoice_num
LEFT JOIN edw.customer_account_dim AS cad
       ON f.customer_account_dim_id = cad.customer_account_dim_id
LEFT JOIN edw.product_dim AS pd
       ON f.product_dim_id = pd.product_dim_id
LEFT JOIN edw.product_dim AS svcpd
       ON f.covered_product_dim_id = svcpd.product_dim_id
LEFT JOIN edw.date_dim AS inv_dt
       ON f.Invoice_Document_Date_Dim_Id = inv_dt.date_dim_id
GROUP BY ALL

;


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
-- Ad-hoc inspection of the Silver invoice table
SELECT *
FROM shaurya.silver_invoices_pdf_extract

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

## 02. Silver Layer for Wikipedia Extracts
#### Get customer insights from an online Wikipedia search

In [0]:
%python
%pip install requests

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%python
# Restart the Python process once so the package installed by %pip above can
# be imported. A restart discards Python state, including imports, so the
# imports live in the cells that follow rather than in this one.
dbutils.library.restartPython()


In [0]:
%python
import requests

In [0]:
%python
# Suppress SSL warnings since we're using verify=False
# urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def search_wikipedia_and_get_content(search_term, verify=False, timeout=10):
    """Return the plain-text extract of the top Wikipedia hit for a search term.

    Two requests are made to the English Wikipedia API: a full-text search for
    ``search_term``, then a fetch of the plain-text extract of the first
    result's page. The first hit is used as-is; it is not checked for
    relevance to the search term.

    Args:
        search_term: Text to search for. Empty, blank or None yields the
            "no results" message without any network call.
        verify: Passed to ``requests.get`` as ``verify``. Defaults to False to
            keep the existing behaviour.
            SECURITY: False disables TLS certificate verification; pass True
            (or a CA bundle path) wherever the network allows it.
        timeout: Per-request timeout in seconds.

    Returns:
        The page extract as a string, or the message
        "No results found for your search term." when there is nothing to
        return.

    Raises:
        requests.HTTPError: If either API call returns an HTTP error status.
    """
    no_results = "No results found for your search term."

    # Nothing to search for: skip the network round-trips entirely.
    if search_term is None or not str(search_term).strip():
        return no_results

    search_url = "https://en.wikipedia.org/w/api.php"

    headers = {
        "User-Agent": "UKG-InvoiceParser/1.0 (shaurya.rawat@ukg.com)"
    }

    # Step 1: Search for the term
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": search_term
    }
    search_http = requests.get(
        search_url,
        params=search_params,
        headers=headers,
        verify=verify,
        timeout=timeout
    )
    # Surface HTTP failures as such rather than as a JSON or key error below.
    search_http.raise_for_status()
    search_response = search_http.json()

    # An API error payload has no "query" key; treat it like an empty result.
    hits = search_response.get("query", {}).get("search", [])
    if not hits:
        return no_results

    page_title = hits[0]["title"]

    # Step 2: Get the page content
    content_params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": page_title,
        "exlimit": 1,      # return the extract for a single page
        "explaintext": 1   # plain text instead of HTML
    }
    content_http = requests.get(
        search_url,
        params=content_params,
        headers=headers,
        verify=verify,
        timeout=timeout
    )
    content_http.raise_for_status()
    content_response = content_http.json()

    # "pages" is keyed by page id; take the first (only) page returned.
    pages = content_response.get("query", {}).get("pages", {})
    if not pages:
        return no_results
    first_page = next(iter(pages.values()))
    return first_page.get("extract", no_results)


In [0]:
%python
# Smoke test: a term with no search hits returns the "no results" message.
search_wikipedia_and_get_content('Padagis US LLC')




'No results found for your search term.'

In [0]:
%python
# Smoke test: the top search hit is returned even when it is unrelated to the
# company (the output below is the article about Alex Honnold).
search_wikipedia_and_get_content('ASCENT LIVING COMMUNITIES')



'Alex Honnold (born August 17, 1985) is an American rock climber best known for his free solo ascents of big walls. Honnold rose to worldwide fame in June 2017 when he became the first person to free solo a full route on El Capitan in Yosemite National Park (via the 2,900-foot route Freerider at 5.13a, the first-ever big wall free solo ascent at that grade), a climb described in The New York Times as "one of the great athletic feats of any kind, ever." In 2015, he won a Piolet d\'Or in alpine climbing with Tommy Caldwell for their completion of the enchainment (known as the Fitz Traverse) of the Cerro Chaltén Group (or Fitzroy Group) in Patagonia over 5 days.\nHonnold is the author (with David Roberts) of the memoir Alone on the Wall (2015) and the subject of the 2018 biographical documentary Free Solo,  which won a BAFTA and an Academy Award.\n\n\n== Early life and education ==\nHonnold was born in Sacramento, California, the son of community college professor Dierdre Wolownick (b. 19

In [0]:
%python
"""Example of Python client calling Knowledge Graph Search API."""
import json
import os
import urllib.parse
import urllib.request

# SECURITY: the API key must never be written into the notebook. It is read
# from the GOOGLE_KG_API_KEY environment variable; on Databricks this can be
# populated from a secret scope. Any key that was previously committed in this
# cell should be treated as leaked and revoked.
api_key = os.environ.get("GOOGLE_KG_API_KEY")
if not api_key:
    raise RuntimeError(
        "GOOGLE_KG_API_KEY is not set; configure it before running this cell."
    )

query = "ASCENT LIVING COMMUNITIES revenue"
service_url = "https://kgsearch.googleapis.com/v1/entities:search"

params = {
    "query": query,
    "limit": 5,
    "indent": "true",
    "key": api_key,
}

# Build the URL with urlencode from urllib.parse
url = service_url + "?" + urllib.parse.urlencode(params)

# Use urllib.request to open the URL; the timeout keeps the cell from hanging
# indefinitely on a stalled connection.
with urllib.request.urlopen(url, timeout=10) as response:
    data = json.loads(response.read())

display(data)


{'@context': {'kg': 'http://g.co/kg',
  'resultScore': 'goog:resultScore',
  'EntitySearchResult': 'goog:EntitySearchResult',
  '@vocab': 'http://schema.org/',
  'goog': 'http://schema.googleapis.com/',
  'detailedDescription': 'goog:detailedDescription'},
 '@type': 'ItemList',
 'itemListElement': []}

In [0]:
-- Silver table holding company attributes derived from Wikipedia text by an
-- LLM, one row per staged customer. All Wikipedia_* values are stored as the
-- raw strings returned by the model (no numeric typing).
-- CREATE OR REPLACE drops any existing data each time this cell runs.
CREATE OR REPLACE TABLE shaurya.Silver_Wikipedia_Customer_Info (
    Extracted_customer_id         STRING,
    EDW_Customer_Name             STRING,
    Wikipedia_industry            STRING,
    Wikipedia_facilities_count    STRING,
    Wikipedia_employee_count      STRING,
    Wikipedia_country_count       STRING,
    Wikipedia_revenue             STRING,
    -- Size category returned by the classification prompt
    Wikipedia_category            STRING,
    Wikipedia_competitors         STRING
)
USING DELTA;

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
%python
import re

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
%python

def extract_company_name(bill_to: str):
    """Extract the company name from an invoice Bill-To block.

    Only the first line of the block is examined, on the basis that the name
    precedes the street address. If that line contains a company suffix (Inc,
    LLC, Ltd, Corp, Co; case-insensitive), everything up to and including the
    first suffix is returned. Otherwise the text before the first ':', '.' or
    ',' is returned.

    Args:
        bill_to: Bill-To text, possibly spanning several lines.

    Returns:
        The extracted name with surrounding whitespace removed, or None when
        ``bill_to`` is not a string or contains no non-whitespace text.
    """
    if not bill_to or not isinstance(bill_to, str):
        return None

    text = bill_to.strip()
    if not text:
        return None

    # Restrict the search to the first line so words in the address lines
    # (e.g. "Corp" in a street name) are never taken for a suffix.
    first_line = text.splitlines()[0].strip()

    # Look for company suffixes (Inc, LLC, Ltd, Corp, Co). "Co" followed by a
    # 5-digit number is skipped: on a one-line address that is the state code
    # before a ZIP ("Denver, CO 80202"), not a company suffix.
    match = re.search(
        r'\b(?:Inc|LLC|Ltd|Corp|Co(?!\s+\d{5}\b))\b',
        first_line,
        flags=re.IGNORECASE,
    )
    if match:
        # Keep everything up to and including the suffix
        return first_line[:match.end()].strip()

    # No suffix: split on punctuation (: , .) and take the first part
    return re.split(r'[:.,]', first_line)[0].strip()


# Load the distinct customer / Bill-To combinations from the Silver invoice table
spark_df = (
    spark.table("shaurya.Silver_Invoices_PDF_Extract")
    .select("Extracted_customer_id", "EDW_Customer_Name", "Extracted_bill_to")
    .distinct()
)

# Convert to Pandas for row-by-row iteration (one Wikipedia lookup per row)
pdf = spark_df.toPandas()

# Build one tuple per customer. The company name is extracted once and reused
# for the lookup; rows without a usable name skip the lookup and get a NULL
# wikipedia_text.
wiki_rows = []
for _, row in pdf.iterrows():
    bill_to = row["Extracted_bill_to"]
    company_name = extract_company_name(bill_to)
    wikipedia_text = (
        search_wikipedia_and_get_content(company_name) if company_name else None
    )
    wiki_rows.append(
        (
            row["Extracted_customer_id"],
            row["EDW_Customer_Name"],
            bill_to,
            company_name,
            wikipedia_text,
        )
    )

# Explicit schema with one name per tuple field. The company name has its own
# column so that wikipedia_text really holds the article text, and a declared
# schema also lets an empty input produce an empty DataFrame.
wiki_schema = (
    "customer_id string, customer_name string, bill_to string, "
    "company_name string, wikipedia_text string"
)
wiki_df = spark.createDataFrame(wiki_rows, schema=wiki_schema)

# Save as temp view
wiki_df.createOrReplaceTempView("staging_customer_wiki_data")


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
-- Ad-hoc inspection of the staged Wikipedia lookups
SELECT *
FROM staging_customer_wiki_data

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
-- Rebuilds the Wikipedia customer-info table from the staged lookups.
-- Step 1 (llm_enriched) asks the model for a JSON summary and a size category
-- per customer; step 2 pulls individual fields out of that JSON.
-- Columns are matched to the target table by position.
INSERT OVERWRITE TABLE shaurya.Silver_Wikipedia_Customer_Info
WITH llm_enriched AS (
  SELECT
    customer_id,
    customer_name,
    ai_query(
      "databricks-meta-llama-3-3-70b-instruct",
      "You are extracting structured company info. 

       Extract into JSON with fields:
         - industry
         - facilities_count
         - employee_count
         - country_count
         - competitors
         - revenue

       - Industry is the primary work of company like software solution, Consulting service, healthcare, retail store, fitness technology etc
       - employee count is number of employees that the company has overall globally
       - country count is number of countries that the company is operating in
       - competitors are the alternative solutions that the company is competing with in same industry
       - get revenue of company in format for eg billion usd
       - Only return JSON data (no commentary, no ```json fences)

       Text: " || wikipedia_text
    ) AS structured_info,
    
    ai_query(
    "databricks-meta-llama-3-3-70b-instruct",
    "Classify this company into a single category based on its overall size and reach. 
     Categories:
      - Small Business (< 500 employees)
      - Mid-Market (500 – 5,000 employees)
      - Enterprise (5,000 – 50,000 employees or global revenue > 10 billion dollar)
      - Global Enterprise (> 50,000 employees or operates in multiple continents)

      - Only Return only the category name (Small Business, Mid-Market, Enterprise, Global Enterprise) based on company, not parent company
      - Do not show any other content
      Text: " || wikipedia_text
    ) AS category_info

  FROM staging_customer_wiki_data
)
SELECT
  customer_id                                             AS Extracted_customer_id,
  customer_name                                           AS EDW_Customer_Name,
  get_json_object(structured_info, '$.industry')          AS Wikipedia_industry,
  get_json_object(structured_info, '$.facilities_count')  AS Wikipedia_facilities_count,
  get_json_object(structured_info, '$.employee_count')    AS Wikipedia_employee_count,
  get_json_object(structured_info, '$.country_count')     AS Wikipedia_country_count,
  get_json_object(structured_info, '$.revenue')           AS Wikipedia_revenue,
  category_info                                           AS Wikipedia_category,
  get_json_object(structured_info, '$.competitors')       AS Wikipedia_competitors
FROM llm_enriched


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
-- Ad-hoc inspection of the Wikipedia customer-info table
SELECT *
FROM shaurya.silver_wikipedia_customer_info

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

%md
## 03. Gold Layer for Customer Insights
#### Get customer insights from the Silver layer (Wikipedia and invoice PDFs)

In [0]:
-- Gold view: every Silver invoice line joined to the Wikipedia-derived
-- attributes of its customer. Invoice lines without a Wikipedia row are kept
-- with NULL Wikipedia_* columns (LEFT JOIN).
-- OR REPLACE makes the cell re-runnable, like the other DDL in this notebook.
-- Invoice columns are listed explicitly, in the Silver table's column order,
-- so the view's shape is visible here.
-- NOTE(review): if the Wikipedia table holds more than one row per customer
-- id, invoice lines are repeated once per match - confirm uniqueness.
CREATE OR REPLACE VIEW shaurya.gold_customer_insight_view AS (
SELECT
  pdf_inv.Extracted_customer_id,
  pdf_inv.EDW_Customer_Name,
  pdf_inv.EDW_product_name,
  pdf_inv.EDW_covered_product_name,
  pdf_inv.Extracted_invoice_num,
  pdf_inv.Extracted_invoice_date,
  pdf_inv.Extracted_due_date,
  pdf_inv.Extracted_bill_to,
  pdf_inv.Extracted_ship_to,
  pdf_inv.Extracted_Invoice_subtotal,
  pdf_inv.EDW_Invoice_tax,
  pdf_inv.Extracted_Invoice_tax,
  pdf_inv.EDW_Invoice_total,
  pdf_inv.Extracted_Invoice_total,
  pdf_inv.Extracted_line_product,
  pdf_inv.Extracted_line_description,
  pdf_inv.Extracted_line_qty,
  pdf_inv.Extracted_line_unit_price,
  pdf_inv.Extracted_line_extended_price,
  pdf_inv.Extracted_line_sales_tax,
  pdf_inv.Extracted_line_total,
  wiki_insight.Wikipedia_industry,
  wiki_insight.Wikipedia_facilities_count,
  wiki_insight.Wikipedia_employee_count,
  wiki_insight.Wikipedia_country_count,
  wiki_insight.Wikipedia_competitors,
  wiki_insight.Wikipedia_revenue,
  wiki_insight.Wikipedia_category
FROM shaurya.silver_invoices_pdf_extract AS pdf_inv
LEFT JOIN shaurya.silver_wikipedia_customer_info AS wiki_insight
  ON pdf_inv.Extracted_customer_id = wiki_insight.Extracted_customer_id
)

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data

In [0]:
-- Ad-hoc inspection of the Gold customer-insight view
SELECT *
FROM shaurya.gold_customer_insight_view

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data