In [17]:
import re

# Raw input text
text = """Olde Thompson Granulated Ginger, 4 lb
Item 1453521  $20.89
1
1
Delivered
$20.89
$20.89
Olde Thompson Nutmeg, 20.1 oz
Item 1833299  $12.19
1
1
Delivered
$12.19
$12.19
Organic Pure Cane Sugar, Brown, 7.5 lbs
Item 1582259  $11.59
2
2
Delivered
$23.18
$23.18
Darigold Classic Cage Free Egg Nog, 64 ﬂ oz
Item 1754420  $6.99
2
2
Delivered
$13.98
$13.98
12/19/25, 9:49 PM Orders & Purchases
https://www.costcobusinessdelivery.com/OrderDetailPrintView?orderId=1243757654 1/2
Organic Sweet Potatoes, 5 lbs
Item 2003  $6.19
1
1
Delivered
$6.19
$6.19
Honey Maid Graham Crackers, Honey, 14.4 oz, 4 ct
Item 204529  $11.59
2
2
Delivered
$23.18
$23.18"""

# Your regex pattern
pattern = r"\n(?=[A-Za-z][^\n]*?\nItem\s+\d+\s+\$?\d+\.\d{2})"

# Compile for clarity and reuse
p_li_split = re.compile(pattern)

# Apply split
chunks = re.split(p_li_split, text)

# Display results
for i, chunk in enumerate(chunks, start=1):
    print(f"\n--- Line Item {i} ---")
    print(chunk.strip())



--- Line Item 1 ---
Olde Thompson Granulated Ginger, 4 lb
Item 1453521  $20.89
1
1
Delivered
$20.89
$20.89

--- Line Item 2 ---
Olde Thompson Nutmeg, 20.1 oz
Item 1833299  $12.19
1
1
Delivered
$12.19
$12.19

--- Line Item 3 ---
Organic Pure Cane Sugar, Brown, 7.5 lbs
Item 1582259  $11.59
2
2
Delivered
$23.18
$23.18

--- Line Item 4 ---
Darigold Classic Cage Free Egg Nog, 64 ﬂ oz
Item 1754420  $6.99
2
2
Delivered
$13.98
$13.98
12/19/25, 9:49 PM Orders & Purchases
https://www.costcobusinessdelivery.com/OrderDetailPrintView?orderId=1243757654 1/2

--- Line Item 5 ---
Organic Sweet Potatoes, 5 lbs
Item 2003  $6.19
1
1
Delivered
$6.19
$6.19

--- Line Item 6 ---
Honey Maid Graham Crackers, Honey, 14.4 oz, 4 ct
Item 204529  $11.59
2
2
Delivered
$23.18
$23.18


In [None]:
import re
from pprint import pprint

# -----------------------------
# Input text
# -----------------------------
block_text = """Olde Thompson Granulated Ginger, 4 lb
Item 1453521  $20.89
1
1
Delivered
$20.89
$20.89
Olde Thompson Nutmeg, 20.1 oz
Item 1833299  $12.19
1
1
Delivered
$12.19
$12.19
Organic Pure Cane Sugar, Brown, 7.5 lbs
Item 1582259  $11.59
2
2
Delivered
$23.18
$23.18
Darigold Classic Cage Free Egg Nog, 64 ﬂ oz
Item 1754420  $6.99
2
2
Delivered
$13.98
$13.98
12/19/25, 9:49 PM Orders & Purchases
https://www.costcobusinessdelivery.com/OrderDetailPrintView?orderId=1243757654 1/2
Organic Sweet Potatoes, 5 lbs
Item 2003  $6.19
1
1
Delivered
$6.19
$6.19
Honey Maid Graham Crackers, Honey, 14.4 oz, 4 ct
Item 204529  $11.59
2
2
Delivered
$23.18
$23.18"""

# -----------------------------
# Split regex (your provided one)
# -----------------------------
p_li_split = re.compile(
    r"\n(?=[A-Za-z][^\n]*?\nItem\s+\d+\s+\$?\d+\.\d{2})"
)

# -----------------------------
# Extraction regexes
# -----------------------------
p_li_qty = re.compile(
    r"(?:\nItem\s+\d+\s+\$?\d+\.\d{2}\n)(\d+)\n",
    re.MULTILINE
)

p_li_desc = re.compile(
    r"^([^\n]+?)(?:,\s*(?:(?:1 )?Gallon,\s*\d+\s*ct|\d+\.?\d*\s*(?:lbs|ct|oz|kg|g|L|ml)))?\nItem",
    re.MULTILINE
)

p_li_unit = re.compile(
    r"(?:[^\n]+?)(?:,\s*((?:1 )?Gallon,\s*\d+\s*ct|\d+\.?\d*\s*(?:lbs|ct|oz|kg|g|L|ml)))?\nItem",
    re.MULTILINE
)

p_li_price = re.compile(
    r"Item\s+\d+\s+\$?(\d+\.\d{2})\n"
)

p_li_total = re.compile(
    r"(?:Delivered|Cancelled)\n\$(\d+\.\d{2})"
)

# -----------------------------
# Helper extractor
# -----------------------------
def extract_val(pattern, text):
    m = pattern.search(text)
    return m.group(1) if m else None

# -----------------------------
# Split into chunks
# -----------------------------
chunks = re.split(p_li_split, block_text)

line_items = []

for idx, chunk in enumerate(chunks, start=1):
    chunk = chunk.strip()
    if not chunk:
        continue

    print(f"\n================ CHUNK {idx} ================\n")
    print(chunk)
    print("\n---------------------------------------------")

    # Normalize description AFTER extraction
    description = extract_val(p_li_desc, chunk)
    if description:
        description = re.sub(r"\s*\n\s*", " ", description).strip()

    item = {
        "quantity": extract_val(p_li_qty, chunk),
        "description": description,
        "unit": extract_val(p_li_unit, chunk),
        "unit_price": extract_val(p_li_price, chunk),
        "line_total": extract_val(p_li_total, chunk),
    }

    print("Extracted fields:")
    pprint(item)

    if item["description"] and (item["line_total"] or item["unit_price"]):
        line_items.append(item)

# -----------------------------
# Final parsed line items
# -----------------------------
print("\n================ FINAL LINE ITEMS ================\n")
pprint(line_items)




Olde Thompson Granulated Ginger, 4 lb
Item 1453521  $20.89
1
1
Delivered
$20.89
$20.89

---------------------------------------------
Extracted fields:
{'description': 'Olde Thompson Granulated Ginger, 4 lb',
 'line_total': '20.89',
 'quantity': '1',
 'unit': None,
 'unit_price': '20.89'}


Olde Thompson Nutmeg, 20.1 oz
Item 1833299  $12.19
1
1
Delivered
$12.19
$12.19

---------------------------------------------
Extracted fields:
{'description': 'Olde Thompson Nutmeg',
 'line_total': '12.19',
 'quantity': '1',
 'unit': None,
 'unit_price': '12.19'}


Organic Pure Cane Sugar, Brown, 7.5 lbs
Item 1582259  $11.59
2
2
Delivered
$23.18
$23.18

---------------------------------------------
Extracted fields:
{'description': 'Organic Pure Cane Sugar, Brown',
 'line_total': '23.18',
 'quantity': '2',
 'unit': None,
 'unit_price': '11.59'}


Darigold Classic Cage Free Egg Nog, 64 ﬂ oz
Item 1754420  $6.99
2
2
Delivered
$13.98
$13.98
12/19/25, 9:49 PM Orders & Purchases
https://www.costcobusin