# 03_llm_extraction.ipynb

# LLM Enhancement
To improve the accuracy of invoice data extraction, we introduce an LLM-based correction layer over the existing OCR and regex pipeline. The system first processes documents through OCR—using native text extraction for PDFs and EasyOCR for images—and standardizes the results via regex before storing them in CSV format.

**Option A** uses a text-based LLM, *Gemini*, which receives the OCR output and the regex-cleaned data. It reformats the extracted fields into a structured JSON record for each invoice (one invoice header plus its line items) and outputs a confidence score. A validation script checks for missing or invalid fields; if confidence is low or errors are detected, **Option B** is triggered.

**Option B** employs a multimodal (vision + text) LLM, also *Gemini*, which takes the JSON produced by Option A and loads the original document (image or PDF) from the file path stored in that JSON to refine and correct the extracted data.


In [3]:
import os
import pandas as pd

# Resolve the project layout relative to the notebook's working directory
root_folder = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_folder = os.path.join(root_folder, "data", "processed")

# Output folder of the LLM stage (created if it does not exist yet)
llm_folder = os.path.join(processed_folder, "llm")
os.makedirs(llm_folder, exist_ok=True)

# Output files
llm_invoices_path = os.path.join(llm_folder, "llm_invoices.csv")
llm_lineitems_path = os.path.join(llm_folder, "llm_lineitems.csv")

# Column layout of each output file
invoices_columns = ["file_path", "invoice_id", "vendor", "date", "total", "invoice_number"]
lineitems_columns = ["file_path", "invoice_id", "description", "quantity", "unit_price", "total"]

# Write header-only CSVs. NOTE: this overwrites any existing file at these paths.
for _csv_path, _columns in (
    (llm_invoices_path, invoices_columns),
    (llm_lineitems_path, lineitems_columns),
):
    pd.DataFrame(columns=_columns).to_csv(_csv_path, index=False)


print("Folders and CSV files created successfully:")
for _csv_path in (llm_invoices_path, llm_lineitems_path):
    print(f"- {_csv_path}")

Folders and CSV files created successfully:
- c:\Stealth AI\Clean Reader\data\processed\llm\llm_invoices.csv
- c:\Stealth AI\Clean Reader\data\processed\llm\llm_lineitems.csv


# LLM Rationale
We first call the Google Gemini Flash API in text-only mode (the model is set by `MODEL_NAME` in the code below, currently `gemini-2.5-flash`), since a text-only request is less compute-intensive. If the reported confidence is low or the validation rules fail (e.g., missing values or letters in numeric fields), we fall back to the same model in multimodal mode (vision + text), sending it the original document. This approach reserves paid tokens for complex cases, with the confidence threshold adjustable based on invoice difficulty or readability.

In [1]:
import os
# Placeholder only: replace the value with a real key locally and never commit it.
# NOTE: the client-initialisation code further down only checks that this variable
# is non-empty, so the placeholder string itself passes that check.
os.environ["GEMINI_API_KEY"] = "My key here"

---------------------------

In [None]:
import os
import re
import json
import google.generativeai as genai

# Gemini model name passed to genai.GenerativeModel by the extractors below
MODEL_NAME = "gemini-2.5-flash"

# --- Configure the Gemini SDK, failing fast when no key is set ---
api_key = os.environ.get("GEMINI_API_KEY")
if api_key:
    genai.configure(api_key=api_key)
else:
    raise EnvironmentError("Missing GEMINI_API_KEY. Please set it as an environment variable.")


def _clean_json_output(raw_text: str) -> str:
    """
    Reduce a raw Gemini reply to the JSON payload it contains.

    Handles three shapes of reply:
    * a markdown-fenced block (with any language tag), optionally surrounded
      by commentary: the contents of the first fenced block are used;
    * dangling fences (e.g. an opening fence without a closing one): the
      fence markers are stripped;
    * prose before or after a JSON object: the text is trimmed to the span
      from the first "{" to the last "}".

    The result is NOT guaranteed to be valid JSON; callers must still parse it.

    Args:
        raw_text: Text returned by the model.

    Returns:
        The cleaned text (possibly an empty string if the fenced block was empty).

    Raises:
        ValueError: If the reply is empty or contains only whitespace.
    """
    if not raw_text or not raw_text.strip():
        raise ValueError("Model returned empty response.")

    text = raw_text.strip()

    # Prefer the contents of a complete fenced block; this also discards any
    # commentary the model placed before or after the fence.
    fenced = re.search(r"```[A-Za-z]*\s*(.*?)```", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()
    else:
        # No complete fence pair: strip stray opening/closing fence markers.
        text = re.sub(r"^```(?:json)?|```$", "", text, flags=re.MULTILINE).strip()

    # Trim prose around a JSON object. A top-level array is left untouched.
    if not text.startswith("["):
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            text = text[start:end + 1]

    return text


def extract_invoice_json(clean_text: str, invoice_data: list, lineitems_data: list) -> dict:
    """
    Send cleaned invoice text + regex-extracted data to Gemini and return structured JSON.

    Args:
        clean_text: OCR/regex-cleaned text of one invoice.
        invoice_data: Regex-extracted invoice header data (JSON-serialisable);
            embedded verbatim in the prompt.
        lineitems_data: Regex-extracted line items (JSON-serialisable);
            embedded verbatim in the prompt.

    Returns:
        The parsed model reply as a dict. The prompt asks for the keys
        "llm_invoices", "llm_lineitems" and "confidence", but the reply is
        not validated against that schema here.

    Raises:
        ValueError: If the reply is empty, is not valid JSON after cleaning,
            or is valid JSON but not an object.
    """
    prompt = f"""
You are an expert AI that extracts structured invoice data from semi-structured text.

Rules:
1. Output ONLY valid JSON, no explanations.
2. Do NOT wrap JSON in markdown code fences (no ```json or ```).
3. Each line item may contain a description, size, and brand together — separate them correctly.
4. Make sure description doesn't include size or company/brand name. Size and brand should be separate fields.
5. Include a confidence score between 0 and 1 based on extraction reliability.
6. If a field is missing, use null.
7. Ensure totals match quantity * unit_price if possible, otherwise keep the total from the invoice text.
8. Do not include fuel surcharges or taxes as regular line items.
9. Use the following schema strictly:

{{
  "llm_invoices": {{
    "file_path": str,
    "vendor": str,
    "date": str,
    "total": float,
    "invoice_number": str
  }},
  "llm_lineitems": [
    {{
      "invoice_number": str,
      "description": str,
      "quantity": float or null,
      "unit_price": float or null,
      "total": float or null
    }}
  ],
  "confidence": float
}}

Cleaned text:
{clean_text}

Regex invoice data:
{json.dumps(invoice_data, indent=2)}

Regex line items data:
{json.dumps(lineitems_data, indent=2)}
"""

    # --- Call Gemini model ---
    model = genai.GenerativeModel(MODEL_NAME)
    response = model.generate_content(prompt)
    raw_text = response.text.strip() if response and response.text else ""

    # --- Clean & parse JSON ---
    cleaned = _clean_json_output(raw_text)

    try:
        parsed_json = json.loads(cleaned)
    except json.JSONDecodeError as e:
        # Chain the decode error so the original position info stays in the traceback.
        raise ValueError(
            f"Model did not return valid JSON even after cleaning. "
            f"Error: {e}\nRaw output:\n{raw_text}"
        ) from e

    # Valid JSON is not necessarily an object (the model may return a list or a
    # scalar); downstream code calls .get() on the result, so reject it here.
    if not isinstance(parsed_json, dict):
        raise ValueError(
            f"Model returned JSON of type {type(parsed_json).__name__}, expected an object.\n"
            f"Raw output:\n{raw_text}"
        )
    return parsed_json


# --- Example Usage ---
# if __name__ == "__main__":
#     clean_text = """pacific flood 379183 mporters inc 18620 80th court south, bldg: f kent, wa 980
# www-pacificfoodimporters.com purchase order no terms order no cust id order date | date sales rep net 7 days
# 08/04/2025 08/05/2025 18
# sold to: westmans bagel { caffe dba
# ship to: westmans bagel { caffe dba tetia llc # 1 tetia llc 2925 223rd pl sw 5201 university way ne b brier , wa usa seattlebrier , wa usa
# 5 fax: routelstop: 03-tu 6 ship via: of
# 1 product id ordered shipped description size brand st gross wt price per amount
# 102950 8 000 8 . ooo/cs flour power 24 . 063 cs 192 50 graincraft
# 157301 1.000 1 000 cs sesam seeds white+ 4 x 80 _ 250 cs 80 25 marca croc
# 33425 1.000 1.000 cs jalapeno sliced 6 # 51.329 cs 51.33 savor
# 78825 x 1.000 1.000 ea currants 29.203 ea 29 20
# 3210 3 . 000 3 . 0o/cs milk oat 12 x 32 oz 84 . 39.948 cs 119.84 oatley
# 109950 1.000 1.oo0/cs sugar granulated cane 26 . 28 . 587 cs 28 59 ceh sugar
# 191928 1_ 000 1_ ooolcs eggs liquid 2 x 95.225 cs 95.23 papettis
# total weight] 626 _ 67 sub total 596 _ 94 : st = status code: discount taxed t sub s freight tax total cash check# charge paid on acct total 596.94"""

#     invoice_data = [
#         {
#             "file_path": r"C:\Stealth AI\Clean Reader\data\raw\train\Copy of ARPFIINVOEBTCHLASER (1).jpg",
#             "invoice_id": 379183,
#             "vendor": "pacificfoodimporters",
#             "date": "08/05/2025",
#             "total": 596.94,
#             "invoice_number": "379183"
#         }
#     ]

#     lineitems_data = [
#         {"description": "063 192 50 graincraft 157301", "quantity": 0, "unit_price": None, "total": 1.00},
#         {"description": "80 25 marca croc 33425", "quantity": 250, "unit_price": 0.0, "total": 1.00},
#         {"description": "jalapeno sliced 6", "quantity": 1, "unit_price": 51.32, "total": 51.32},
#         {"description": "", "quantity": 9, "unit_price": 5.7, "total": 51.33},
#         {"description": "x 78825", "quantity": 0, "unit_price": 0.0, "total": 1.00},
#         {"description": "currants", "quantity": 1, "unit_price": 29.2, "total": 29.20},
#         {"description": "", "quantity": 39, "unit_price": 3.07, "total": 119.84},
#         {"description": "x 2", "quantity": 47.61, "unit_price": None, "total": 95.22},
#         {"description": "", "quantity": 5, "unit_price": 19.05, "total": 95.23}
#     ]

#     result_json = extract_invoice_json(clean_text, invoice_data, lineitems_data)
#     print(json.dumps(result_json, indent=2))



In [17]:

import os
import re
import json
import mimetypes
import google.generativeai as genai

def visual_extract_invoice_json(existing_json: dict) -> dict:
    """
    Takes the JSON output from extract_invoice_json() and the invoice image/PDF (from file_path).
    Uses Gemini multimodal vision model to correct missing or incorrect fields.

    The document is uploaded through the Gemini Files API for the duration of
    the request and deleted again afterwards (best effort).

    Args:
        existing_json (dict): JSON from extract_invoice_json(). Its
            "llm_invoices"."file_path" entry must point to an existing file.

    Returns:
        dict: Corrected JSON as returned by the model (requested to follow the same schema).

    Raises:
        FileNotFoundError: If no usable file path is present or the file does not exist.
        ValueError: If the model reply is empty, is not valid JSON, or is not a JSON object.
    """
    # --- Validate file path from JSON ---
    invoice_header = existing_json.get("llm_invoices") if isinstance(existing_json, dict) else None
    file_path = invoice_header.get("file_path") if isinstance(invoice_header, dict) else None
    if not file_path or not isinstance(file_path, str) or not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found or invalid: {file_path}")

    # --- Construct correction prompt (before uploading, so a failure here leaves nothing behind) ---
    prompt = f"""
You are an expert invoice extraction AI.
You are given a JSON that was generated from an OCR process. 
The JSON may have missing fields or incorrect values.

Your task:
1. Use the provided invoice image/PDF to **verify and correct** all data.
2. Make sure **no field is null or missing** if the information is visible in the document.
3. Keep the same JSON schema.
4. If a field is not visible at all, use null, not a guess.
5. In line items, the "description" field may incorrectly include product size or brand name.
   - Correct format: description / size / brand
   - Only the **first part (product name)** should go in "description".
   - The rest (size, brand) should NOT appear in the "description" field.
   - Example corrections:
       * "flour power 50 lb graincraft" → description="flour power", size="50 lb", brand="graincraft"
       * "jalapeno sliced 6#10 savor" → description="jalapeno sliced", size="6#10", brand="savor"
   - For this schema, **only keep description, quantity, unit_price, total**.
     (size and brand need not appear unless your schema includes them explicitly.)
6. Ensure totals = quantity × unit_price wherever possible.
7. Recalculate confidence between 0 and 1 based on correction reliability.
8. Output **only valid JSON** (no markdown or commentary).

Here is the current JSON to correct:
{json.dumps(existing_json, indent=2)}
"""

    # --- Load file for Gemini ---
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type is None:
        mime_type = "image/jpeg"  # fallback default
    file_obj = genai.upload_file(file_path, mime_type=mime_type)

    # --- Call Gemini multimodal model ---
    try:
        model = genai.GenerativeModel(MODEL_NAME)
        response = model.generate_content([prompt, file_obj])
        raw_text = response.text.strip() if response and response.text else ""
    finally:
        # Best-effort cleanup so uploads do not accumulate; a cleanup failure
        # must not hide the extraction result or the original error.
        try:
            genai.delete_file(file_obj.name)
        except Exception as cleanup_error:
            print(f"Warning: could not delete uploaded file for {file_path}: {cleanup_error}")

    cleaned = _clean_json_output(raw_text)

    try:
        corrected_json = json.loads(cleaned)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Gemini did not return valid JSON. Error: {e}\nRaw output:\n{raw_text}"
        ) from e

    # Callers use .get() on the result, so anything but an object is unusable.
    if not isinstance(corrected_json, dict):
        raise ValueError(
            f"Gemini returned JSON of type {type(corrected_json).__name__}, expected an object.\n"
            f"Raw output:\n{raw_text}"
        )
    return corrected_json


# --- Example Usage ---
# if __name__ == "__main__":
#     # Example: using JSON from previous extract_invoice_json() run
#     previous_json = {
#         "llm_invoices": {
#             "file_path": r"C:\Stealth AI\Clean Reader\data\raw\train\Copy of ARPFIINVOEBTCHLASER (1).jpg",
#             "vendor": "pacificfoodimporters",
#             "date": "08/05/2025",
#             "total": 596.94,
#             "invoice_number": "379183"
#         },
#         "llm_lineitems": [
#             {
#                 "invoice_number": "379183",
#                 "description": "flour power 50 lb graincraft",
#                 "quantity": 8.0,
#                 "unit_price": None,
#                 "total": 192.50
#             },
#             {
#                 "invoice_number": "379183",
#                 "description": "jalapeno sliced 6#10 savor",
#                 "quantity": 1.0,
#                 "unit_price": 51.33,
#                 "total": 51.33
#             },
#             {
#                 "invoice_number": "379183",
#                 "description": "currants 29.2 ea",
#                 "quantity": 1.0,
#                 "unit_price": 29.20,
#                 "total": 29.20
#             },
#             {
#                 "invoice_number": "379183",
#                 "description": "milk oat 12x32oz oatley",
#                 "quantity": 3.0,
#                 "unit_price": 39.95,
#                 "total": 119.84
#             },
#             {
#                 "invoice_number": "379183",
#                 "description": "sugar granulated cane 25 lb ceh sugar",
#                 "quantity": 1.0,
#                 "unit_price": 28.59,
#                 "total": 28.59
#             },
#             {
#                 "invoice_number": "379183",
#                 "description": "eggs liquid 2x95 papettis",
#                 "quantity": 1.0,
#                 "unit_price": 95.23,
#                 "total": 95.23
#             }
#         ],
#         "confidence": 0.93
#     }

#     corrected = visual_extract_invoice_json(previous_json)
#     print(json.dumps(corrected, indent=2))


In [24]:
import os
import csv
import json
import re
from datetime import datetime
from typing import Dict, List, Tuple, Any

# Project root: the parent of the current working directory
ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
PROCESSED = os.path.join(ROOT, "data", "processed")
REGEX_FOLDER = os.path.join(PROCESSED, "regex")

# Output folder of the LLM stage (created here if it does not exist)
LLM_FOLDER = os.path.join(PROCESSED, "llm")
os.makedirs(LLM_FOLDER, exist_ok=True)

LLM_INVOICES_PATH = os.path.join(LLM_FOLDER, "llm_invoices.csv")
LLM_LINEITEMS_PATH = os.path.join(LLM_FOLDER, "llm_lineitems.csv")

# Input CSVs. NOTE: the cleaned-invoices file is resolved directly under PROCESSED,
# whereas the other two inputs are resolved inside REGEX_FOLDER.
REGEX_CLEANED_INVOICES_CSV = os.path.join(PROCESSED, "regex_cleaned_invoices.csv")
REGEX_INVOICES_CSV = os.path.join(REGEX_FOLDER, "regex_invoices.csv")        # optional
REGEX_LINEITEMS_CSV = os.path.join(REGEX_FOLDER, "regex_lineitems.csv")

# Validation / business parameters
# Minimum model-reported confidence for a result to be accepted as valid
CONFIDENCE_THRESHOLD = 0.5
# Output format for normalized dates: dd-mm-YYYY with dashes
TARGET_DATE_FORMAT = "%d-%m-%Y"


In [25]:
def read_csv_as_dicts(path: str) -> List[Dict[str, str]]:
    """Load a CSV file into a list of per-row dicts keyed by the header row.

    A missing file is not treated as an error: a warning is printed and an
    empty list is returned.
    """
    if os.path.exists(path):
        with open(path, newline='', encoding='utf-8') as handle:
            return list(csv.DictReader(handle))
    print(f"Warning: file not found: {path}")
    return []

def write_csv_append(path: str, rows: List[Dict[str, Any]], fieldnames: List[str]) -> None:
    """Append rows to a CSV file, writing the header first when the file is new or empty.

    Args:
        path: Destination CSV path.
        rows: Row dicts; keys not listed in ``fieldnames`` are ignored and
            missing or ``None`` values are written as empty cells.
        fieldnames: Column order of the CSV.
    """
    # An existing zero-byte file has no header yet, so it is treated like a new file.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='', encoding='utf-8') as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(
            {k: ("" if r.get(k) is None else r.get(k)) for k in fieldnames}
            for r in rows
        )

def normalize_date(value: str) -> Tuple[Any, str]:
    """
    Try to parse a date string and return (parsed_date_str_or_None, error_message).
    Target format is TARGET_DATE_FORMAT (dd-mm-YYYY).

    Parsing order:
    1. '/', '\\' and '.' separators are replaced by '-' and a list of known
       formats is tried; month-first numeric formats are tried before
       day-first ones.
    2. Otherwise the last three digit groups in the text are read as
       month/day/year, or day/month/year when the first group is > 12.
       A one- or two-digit year is expanded with the same pivot strptime's
       %y uses (00-68 -> 20xx, 69-99 -> 19xx).

    NOTE: because ambiguous numeric dates are read month-first while the
    output is day-first, feeding an already-normalized ambiguous date
    (e.g. "05-08-2025") back into this function swaps day and month.

    Returns:
        (normalized_date, "") on success, or (None, reason) on failure.
    """
    if value is None:
        return None, "missing"
    v = str(value).strip()
    if not v:
        return None, "empty string"

    # Unify the common separators so one set of dash-based formats suffices.
    v_clean = re.sub(r'[\\/\.]', '-', v)
    candidates = (
        "%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y",
        "%Y-%m-%d", "%Y-%d-%m", "%d %b %Y", "%d %B %Y",
        "%b %d, %Y", "%B %d, %Y", "%b %d %Y", "%B %d %Y",
    )
    for fmt in candidates:
        try:
            dt = datetime.strptime(v_clean, fmt)
        except ValueError:
            continue
        return dt.strftime(TARGET_DATE_FORMAT), ""

    # Fallback: pull the date out of surrounding text (e.g. "date: 08/05/2025").
    digits = re.findall(r'\d+', v)
    if len(digits) >= 3:
        first, second, year = (int(d) for d in digits[-3:])
        if len(digits[-1]) <= 2:
            # Expand short years instead of producing a date in year 0-99.
            year += 2000 if year < 69 else 1900
        try:
            # sensible guess: if first > 12 then it is the day
            if first > 12:
                dt = datetime(year, second, first)
            else:
                dt = datetime(year, first, second)
            return dt.strftime(TARGET_DATE_FORMAT), ""
        except (ValueError, OverflowError):
            pass
    return None, f"unparseable date: {value}"

def to_float_safe(value) -> Tuple[Any, str]:
    """Convert to float if possible. Returns (float_or_None, error_reason).

    Strings may contain thousands separators (',') and a '$' sign. Non-finite
    results ("nan", "inf", float("nan"), ...) are rejected, because they would
    silently pass the numeric comparisons done during validation.

    Returns:
        (number, "") on success; (None, "missing") for None; (None, "empty")
        for a blank string; (None, "not_a_number (<value>)") otherwise.
    """
    import math  # local import keeps this helper self-contained

    if value is None:
        return None, "missing"
    try:
        if isinstance(value, (float, int)):
            number = float(value)
        else:
            s = str(value).strip().replace(',', '').replace('$', '')
            if s == "":
                return None, "empty"
            number = float(s)
    except (TypeError, ValueError, OverflowError):
        return None, f"not_a_number ({value})"
    if not math.isfinite(number):
        return None, f"not_a_number ({value})"
    return number, ""

def to_str_safe(value) -> Tuple[Any, str]:
    """Return (stripped_text, "") for a non-blank value, else (None, reason).

    The reason is "missing" for None and "empty" when the stringified value
    is blank after stripping whitespace.
    """
    if value is None:
        return None, "missing"
    text = str(value).strip()
    return (text, "") if text else (None, "empty")

def validate_invoice_json(result_json: Dict[str, Any]) -> Tuple[bool, List[str]]:
    """
    Validate (and normalize in place) one extraction result.

    Checks the invoice header ("llm_invoices"), the confidence score and every
    line item ("llm_lineitems"). Fields that convert cleanly are overwritten in
    ``result_json`` with their normalized value (date string, floats, stripped
    strings); fields that fail keep their original value and add an issue.

    Args:
        result_json: Result dict as produced by the extractors. Mutated in place.

    Returns:
        (is_valid, issues): ``issues`` is a list of human-readable problems.
        ``is_valid`` is True only when there are no issues and the confidence
        is at least CONFIDENCE_THRESHOLD.
    """
    issues: List[str] = []

    # top-level checks
    if not isinstance(result_json, dict):
        return False, ["result is not a JSON object"]
    inv = result_json.get("llm_invoices")
    if not inv:
        issues.append("llm_invoices missing")
        return False, issues
    if not isinstance(inv, dict):
        issues.append("llm_invoices is not an object")
        return False, issues

    # file_path: must be present and point to an existing file
    fp_s, fp_err = to_str_safe(inv.get("file_path"))
    if fp_err:
        issues.append(f"file_path: {fp_err}")
    elif not os.path.exists(fp_s):
        issues.append(f"file_path does not exist on disk: {fp_s}")

    # vendor
    _, vendor_err = to_str_safe(inv.get("vendor"))
    if vendor_err:
        issues.append(f"vendor: {vendor_err}")

    # date
    date_norm, date_err = normalize_date(inv.get("date"))
    if date_err:
        issues.append(f"date: {date_err}")
    else:
        inv["date"] = date_norm  # overwrite with normalized format

    # total
    total_val, total_err = to_float_safe(inv.get("total"))
    if total_err:
        issues.append(f"total: {total_err}")
    else:
        inv["total"] = total_val

    # invoice_number -> must be a string
    invnum, invnum_err = to_str_safe(inv.get("invoice_number"))
    if invnum_err:
        issues.append(f"invoice_number: {invnum_err}")
    else:
        inv["invoice_number"] = invnum

    # confidence: numeric and within the [0, 1] range the prompts ask for
    conf_val, conf_err = to_float_safe(result_json.get("confidence"))
    if conf_err:
        issues.append(f"confidence: {conf_err}")
    else:
        result_json["confidence"] = conf_val
        if not 0.0 <= conf_val <= 1.0:
            issues.append(f"confidence out of range [0, 1]: {conf_val}")

    # line items
    lineitems = result_json.get("llm_lineitems")
    if not isinstance(lineitems, list) or len(lineitems) == 0:
        issues.append("llm_lineitems missing or empty")
    else:
        for idx, li in enumerate(lineitems):
            if not isinstance(li, dict):
                issues.append(f"lineitem[{idx}] is not an object")
                continue

            # description (an empty description is flagged, processing continues)
            desc, desc_err = to_str_safe(li.get("description"))
            if desc_err:
                issues.append(f"lineitem[{idx}].description: {desc_err}")
            else:
                li["description"] = desc

            # quantity (may legitimately be null for some items; still flagged)
            qty, qty_err = to_float_safe(li.get("quantity"))
            if qty_err:
                issues.append(f"lineitem[{idx}].quantity: {qty_err}")
            else:
                li["quantity"] = qty

            # unit_price
            up, up_err = to_float_safe(li.get("unit_price"))
            if up_err:
                issues.append(f"lineitem[{idx}].unit_price: {up_err}")
            else:
                li["unit_price"] = up

            # total
            tval, t_err = to_float_safe(li.get("total"))
            if t_err:
                issues.append(f"lineitem[{idx}].total: {t_err}")
            else:
                li["total"] = tval

            # basic arithmetic check, only when all three numbers converted;
            # using the converted values avoids subtracting a raw string total.
            if not (qty_err or up_err or t_err):
                calc = round(qty * up, 2)
                # allow small rounding differences up to 0.05
                if abs(calc - tval) > 0.05:
                    issues.append(f"lineitem[{idx}] calc mismatch: qty*unit_price={calc} != total {tval}")

    # final validation decision (confidence is a float whenever issues is empty)
    is_valid = (len(issues) == 0) and (result_json.get("confidence", 0) >= CONFIDENCE_THRESHOLD)
    return is_valid, issues


In [26]:
# formatting helpers to prepare inputs for extract_invoice_json

def _to_optional_float(raw):
    """Parse a CSV cell into a float; missing, blank or non-numeric cells become None."""
    if raw is None:
        return None
    text = str(raw).replace(',', '').replace('$', '').strip()
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return None


def build_inputs_for_extractor(cleaned_row: Dict[str, str], lineitems_rows: List[Dict[str, str]]) -> Tuple[str, List[Dict], List[Dict]]:
    """
    Build (clean_text, invoice_data, lineitems_data) in the format expected by extract_invoice_json.

    Column names are matched on a best-effort basis (several spellings are
    tried per field). Numeric line-item cells are parsed uniformly: thousands
    separators and '$' are removed, and blank or non-numeric cells become None.

    Args:
        cleaned_row: One row from regex_cleaned_invoices.csv (expected to
            contain at least the file path and the cleaned text).
        lineitems_rows: The rows from regex_lineitems.csv that belong to the
            same file.

    Returns:
        clean_text: The cleaned invoice text ("" when absent).
        invoice_data: A one-element list holding the invoice header dict.
        lineitems_data: One dict per line item with description, quantity,
            unit_price and total.
    """
    clean_text = cleaned_row.get("cleaned_text") or cleaned_row.get("clean_text") or cleaned_row.get("clean text") or ""
    file_path = cleaned_row.get("file_path") or cleaned_row.get("filepath") or cleaned_row.get("FilePath")
    invoice_id = cleaned_row.get("id") or cleaned_row.get("invoice_id") or cleaned_row.get("invoice number") or cleaned_row.get("invoice_number")
    invoice_number = cleaned_row.get("invoice_number") or cleaned_row.get("invoice number") or invoice_id

    # invoice_data expects a list with one dict
    invoice_data = [{
        "file_path": file_path,
        # purely numeric ids are passed as int, anything else is kept as-is
        "invoice_id": int(invoice_id) if invoice_id and str(invoice_id).isdigit() else invoice_id,
        "vendor": cleaned_row.get("vendor") or "",
        "date": cleaned_row.get("date") or "",
        "total": cleaned_row.get("total") or "",
        "invoice_number": str(invoice_number) if invoice_number is not None else None
    }]

    # build lineitems_data list using a best-effort column mapping
    lineitems_data = []
    for row in lineitems_rows:
        qty = row.get("quantity") or row.get("qty") or row.get("Quantity")
        unit_price = row.get("unit_price") or row.get("unitprice") or row.get("price per") or row.get("unit price")
        total_li = row.get("total") or row.get("amount") or row.get("line_total")
        lineitems_data.append({
            "description": row.get("description") or row.get("desc") or row.get("Description") or "",
            "quantity": _to_optional_float(qty),
            "unit_price": _to_optional_float(unit_price),
            "total": _to_optional_float(total_li)
        })

    return clean_text, invoice_data, lineitems_data


In [27]:
# main processing pipeline

def _empty_llm_result(file_path):
    """Return a placeholder result (all fields null, confidence 0.0) used when text extraction fails."""
    return {
        "llm_invoices": {
            "file_path": file_path,
            "vendor": None,
            "date": None,
            "total": None,
            "invoice_number": None
        },
        "llm_lineitems": [],
        "confidence": 0.0
    }


def _pin_file_path(result_json, file_path):
    """Replace the model-echoed file_path with the path read from the input CSV."""
    header = result_json.get("llm_invoices") if isinstance(result_json, dict) else None
    if file_path and isinstance(header, dict):
        header["file_path"] = file_path


def process_all_regex_invoices():
    """
    Run the two-stage LLM extraction over every row of the cleaned-invoices CSV.

    For each invoice: build the extractor inputs, call the text-only extractor,
    validate the result, and fall back to the visual extractor when validation
    fails (which includes confidence below CONFIDENCE_THRESHOLD). One invoice
    row and its line-item rows are collected per input row, even when the
    result is only partially valid.

    The collected rows are appended to the LLM CSVs in a ``finally`` block, so
    results gathered before an unexpected error or interruption are still saved.
    """
    cleaned_invoices = read_csv_as_dicts(REGEX_CLEANED_INVOICES_CSV)
    if not cleaned_invoices:
        print("No cleaned invoices found. Exiting.")
        return

    # index lineitems by file_path for quick lookup
    lineitems_index = {}
    for row in read_csv_as_dicts(REGEX_LINEITEMS_CSV):
        fp = row.get("file_path") or row.get("filepath") or row.get("FilePath")
        if fp:
            lineitems_index.setdefault(fp, []).append(row)

    # collectors for CSV output
    invoices_to_append = []
    lineitems_to_append = []

    invoice_fieldnames = ["file_path", "invoice_id", "vendor", "date", "total", "invoice_number"]
    lineitem_fieldnames = ["file_path", "invoice_id", "description", "quantity", "unit_price", "total"]

    try:
        for idx, c_row in enumerate(cleaned_invoices, start=1):
            # same column-name fallbacks as used for the line-item index above
            file_path = c_row.get("file_path") or c_row.get("filepath") or c_row.get("FilePath")
            print(f"\nProcessing [{idx}/{len(cleaned_invoices)}] file_path: {file_path}")
            matching_lineitems = lineitems_index.get(file_path, [])
            clean_text, invoice_data, lineitems_data = build_inputs_for_extractor(c_row, matching_lineitems)

            # call text-only extractor
            try:
                result_json = extract_invoice_json(clean_text, invoice_data, lineitems_data)
                if not isinstance(result_json, dict):
                    raise ValueError(f"expected a JSON object, got {type(result_json).__name__}")
            except Exception as e:
                print(f"extract_invoice_json raised an error: {e}")
                # If extract fails completely, fall back to visual directly (minimal JSON)
                result_json = _empty_llm_result(file_path)

            # The model only echoes the path; the CSV value is authoritative and
            # is what the visual extractor needs to open the document.
            _pin_file_path(result_json, file_path)

            # Validate (a crash inside validation counts as a failed validation)
            try:
                is_valid, issues = validate_invoice_json(result_json)
            except Exception as e:
                is_valid, issues = False, [f"validation error: {e}"]

            went_visual = False
            # is_valid already includes the confidence-threshold check
            if not is_valid:
                print("Validation failed or low confidence. Issues:")
                for it in issues:
                    print("  -", it)
                print("Calling visual_extract_invoice_json to attempt correction...")
                went_visual = True
                try:
                    corrected = visual_extract_invoice_json(result_json)
                    print("Visual correction returned. Re-validating...")
                    if not isinstance(corrected, dict):
                        raise ValueError(f"expected a JSON object, got {type(corrected).__name__}")
                    _pin_file_path(corrected, file_path)
                    is_valid_after, issues_after = validate_invoice_json(corrected)
                except Exception as e:
                    print("visual_extract_invoice_json raised an error:", e)
                    # keep original result_json and mark as failed
                    is_valid = False
                else:
                    # adopt the corrected result only once it was re-validated without crashing
                    result_json = corrected
                    if not is_valid_after:
                        print("Still some issues after visual correction:")
                        for it in issues_after:
                            print("  -", it)
                    else:
                        print("Visual correction successful.")
                    is_valid = is_valid_after
                    issues = issues_after
            else:
                print(f"Text-only extraction OK. Confidence = {result_json.get('confidence', None)}")

            # Prepare rows for the LLM CSVs (even if partially invalid, keep available fields)
            inv = result_json.get("llm_invoices")
            if not isinstance(inv, dict):
                inv = {}
            row_file_path = inv.get("file_path") or file_path
            invoices_to_append.append({
                "file_path": row_file_path,
                "invoice_id": inv.get("invoice_number") or inv.get("invoice_id") or "",
                "vendor": inv.get("vendor"),
                "date": inv.get("date"),
                "total": inv.get("total"),
                "invoice_number": inv.get("invoice_number")
            })

            # line items (entries that are not objects cannot be mapped to columns and are skipped)
            for li in result_json.get("llm_lineitems") or []:
                if not isinstance(li, dict):
                    continue
                lineitems_to_append.append({
                    "file_path": row_file_path,
                    "invoice_id": inv.get("invoice_number"),
                    "description": li.get("description"),
                    "quantity": li.get("quantity"),
                    "unit_price": li.get("unit_price"),
                    "total": li.get("total")
                })

            # Print summary for this invoice
            status = "visual" if went_visual else "text-only"
            print(f"Finished: {file_path} — method={status} — valid={is_valid} — confidence={result_json.get('confidence')}")
    finally:
        # flush to CSVs, also when the loop was aborted part-way through
        if invoices_to_append:
            write_csv_append(LLM_INVOICES_PATH, invoices_to_append, invoice_fieldnames)
            print(f"Appended {len(invoices_to_append)} rows to {LLM_INVOICES_PATH}")
        if lineitems_to_append:
            write_csv_append(LLM_LINEITEMS_PATH, lineitems_to_append, lineitem_fieldnames)
            print(f"Appended {len(lineitems_to_append)} rows to {LLM_LINEITEMS_PATH}")

    print("Processing complete.")


In [28]:
# run
# Executes the full pipeline when this code runs as the main module.
# NOTE: results are appended to the LLM CSVs, so running this again adds the rows again.
if __name__ == "__main__":
    process_all_regex_invoices()



Processing [1/57] file_path: C:\Stealth AI\Clean Reader\data\raw\train\Copy of ARPFIINVOEBTCHLASER (1).jpg
Text-only extraction OK. Confidence = 0.9
Finished: C:\Stealth AI\Clean Reader\data\raw\train\Copy of ARPFIINVOEBTCHLASER (1).jpg — method=text-only — valid=True — confidence=0.9

Processing [2/57] file_path: C:\Stealth AI\Clean Reader\data\raw\train\Copy of ARPFIINVOEBTCHLASER (1).pdf
Text-only extraction OK. Confidence = 0.98
Finished: C:\Stealth AI\Clean Reader\data\raw\train\Copy of ARPFIINVOEBTCHLASER (1).pdf — method=text-only — valid=True — confidence=0.98

Processing [3/57] file_path: C:\Stealth AI\Clean Reader\data\raw\train\Copy of ARPFIINVOEBTCHLASER (10)-page2.jpg
Validation failed or low confidence. Issues:
  - total: missing
  - lineitem[0].unit_price: missing
  - lineitem[0].total: missing
Calling visual_extract_invoice_json to attempt correction...
Visual correction returned. Re-validating...
Still some issues after visual correction:
  - total: missing
Finished: 

In [30]:
import re
import wordninja

# global runtime memory for vendors
known_vendors = set()

# Legal-entity suffixes removed from the end of a vendor name.
_VENDOR_SUFFIXES = ("inc", "incorporated", "ltd", "llc", "company", "corp", "co")
# Subset that is also removed after word segmentation of a concatenated name.
# "co" is excluded there: it is too likely to be a fragment of a real name.
_SEGMENTED_VENDOR_SUFFIXES = ("inc", "incorporated", "ltd", "llc", "company", "corp")


def normalize_vendor_name(vendor: str, known_vendors_list=None):
    """
    Normalize and learn vendor names automatically.

    Steps:
    1. Lower-case the name and split it into alphanumeric words.
    2. Drop trailing legal suffixes (Inc, LLC, Co, ...) only when they are
       standalone words, so names that merely end in those letters
       (e.g. "Sysco") are left intact.
    3. Return the known vendor whose space-less form is contained in the
       cleaned name; the longest such match wins, which keeps the result
       independent of set iteration order.
    4. Otherwise split the concatenated name into words with wordninja,
       title-case it, store it as a new known vendor and return it.

    Args:
        vendor: Raw vendor name.
        known_vendors_list: Set of canonical vendor names to match against
            and extend. Defaults to the module-level ``known_vendors`` set.

    Returns:
        The canonical vendor name, or "" when the input is empty or contains
        no alphanumeric characters (such names are never learned).
    """
    # Step 0 – guard clause
    if not vendor:
        return ""

    # Initialize if not passed
    if known_vendors_list is None:
        known_vendors_list = known_vendors  # uses global set

    # Steps 1 + 2 – tokenize, then strip suffix words (always keep one word)
    tokens = re.findall(r"[A-Za-z0-9]+", vendor.lower())
    while len(tokens) > 1 and tokens[-1] in _VENDOR_SUFFIXES:
        tokens.pop()
    vendor_key = "".join(tokens)
    if not vendor_key:
        return ""

    # Step 3 – try known vendors first. Empty entries are skipped because an
    # empty string is a substring of every name and would match everything.
    matches = []
    for known in known_vendors_list:
        known_key = known.lower().replace(" ", "")
        if known_key and known_key in vendor_key:
            matches.append((len(known_key), known))
    if matches:
        return max(matches)[1]  # found canonical form

    # Step 4 – use wordninja to segment new vendors
    words = wordninja.split(vendor_key)
    # If segmentation isolated a legal suffix as the last word, drop it too.
    while len(words) > 1 and words[-1] in _SEGMENTED_VENDOR_SUFFIXES:
        words.pop()
    cleaned = " ".join(words).title()

    # Step 5 – store learned vendor name (never an empty one)
    if cleaned:
        known_vendors_list.add(cleaned)

    return cleaned


In [31]:
import pandas as pd
import os

# --- Paths ---
ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
PROCESSED = os.path.join(ROOT, "data", "processed")
LLM_FOLDER = os.path.join(PROCESSED, "llm")
LLM_INVOICES_PATH = os.path.join(LLM_FOLDER, "llm_invoices.csv")

# --- Load CSV ---
if not os.path.exists(LLM_INVOICES_PATH):
    raise FileNotFoundError(f"File not found: {LLM_INVOICES_PATH}")

df = pd.read_csv(LLM_INVOICES_PATH)

if "vendor" not in df.columns:
    raise KeyError("The CSV does not contain a 'vendor' column.")

# --- Normalize vendor names ---
# Missing vendors are left missing: converting the whole column to str first
# would turn NaN into the text "nan" and normalize it into a vendor of its own.
print("Normalizing vendor names...")
df["vendor"] = df["vendor"].map(
    lambda v: normalize_vendor_name(str(v)) if pd.notna(v) else v
)

# --- Save updated file (overwrites the input CSV in place) ---
df.to_csv(LLM_INVOICES_PATH, index=False)

print(f"✅ Vendor names normalized and updated in {LLM_INVOICES_PATH}")


Normalizing vendor names...
✅ Vendor names normalized and updated in c:\Stealth AI\Clean Reader\data\processed\llm\llm_invoices.csv


In [38]:
import pandas as pd
import os

# --- Locate the LLM invoices CSV ---
ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
PROCESSED = os.path.join(ROOT, "data", "processed")
LLM_FOLDER = os.path.join(PROCESSED, "llm")
LLM_INVOICES_PATH = os.path.join(LLM_FOLDER, "llm_invoices.csv")

# --- Read it, with column names trimmed and lower-cased ---
if not os.path.exists(LLM_INVOICES_PATH):
    raise FileNotFoundError(f"File not found: {LLM_INVOICES_PATH}")

df = pd.read_csv(LLM_INVOICES_PATH)
df.columns = [c.strip().lower() for c in df.columns]

if "vendor" not in df.columns:
    raise KeyError(f"'vendor' column not found. Found columns: {df.columns.tolist()}")

# --- Tally invoices per vendor (values compared as stripped strings) ---
vendor_names = df["vendor"].astype(str).str.strip()
vendor_counts = vendor_names.value_counts().reset_index()

# The two resulting column labels are taken by position rather than by name:
# first the vendor name, then its count.
col_names = list(vendor_counts.columns)
vendor_col, count_col = col_names[0], col_names[1]

# --- Report ---
print("Unique Vendors Found:\n")
for vendor_name, count_value in zip(vendor_counts[vendor_col], vendor_counts[count_col]):
    print(f"{vendor_name:<40} — {count_value} invoices")

print(f"\nTotal unique vendors: {len(vendor_counts)}")


Unique Vendors Found:

Pacific Food Importers                   — 35 invoices
Franks Quality Produce                   — 22 invoices

Total unique vendors: 2
