In [28]:
import pytesseract
from pdf2image import convert_from_path
import re
from fuzzywuzzy import process

# Path to Tesseract-OCR (update this based on system configuration)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Predefined vendor names for fuzzy matching
VENDOR_LIST = ["ABC Supplies Ltd.", "XYZ Traders Inc.", "Global Tech Solutions", "Fast Retail Corp."]

# Function to extract text from invoice PDF
def extract_text_from_invoice(pdf_path):
    # Convert PDF to high-quality images
    images = convert_from_path(
        pdf_path,
        poppler_path=r'C:\Program Files\poppler-24.08.0\Library\bin',
        dpi=300  # Higher DPI for better quality
    )
    
    extracted_text = ""
    for img in images:
        # Enhance image quality before OCR
        img = img.convert('L')  # Convert to grayscale
        extracted_text += pytesseract.image_to_string(
            img,
            lang='eng',
            config='--psm 6 --oem 3'  # PSM 6: Assume a single uniform block of text; OEM 3: Default LSTM engine
        )
    return extracted_text

# Function to clean and parse the extracted text
def parse_invoice_text(text):
    # Fuzzy match vendor name
    vendor_match = process.extractOne(text.split('\n')[0], VENDOR_LIST)
    vendor = vendor_match[0] if vendor_match[0] in VENDOR_LIST else "Unknown Vendor"
    
    # Extract invoice details using regex
    invoice_no = re.search(r'Invoice No: (\S+)', text).group(1)
    date = re.search(r'Date: (\S+)', text).group(1)
    
    # Extract itemized section
    lines = text.split('\n')
    item_section_start = lines.index("Item Qty Price Total") + 2  # Skip header and separator line
    items = []
    for line in lines[item_section_start:]:
        if not line.strip():
            break
        parts = line.split()
        if len(parts) == 4:
            item, qty, price, total = parts
            items.append({
                "Item": item,
                "Qty": int(qty),
                "Price": float(price),
                "Total": float(total)
            })
    
    # Extract total amount
    total_amount = float(re.search(r'Total Amount: \$(\d+\.\d+)', text).group(1))
    
    return {
        "Vendor": vendor,
        "Invoice No": invoice_no,
        "Date": date,
        "Items": items,
        "Total Amount": total_amount
    }

# Example usage
pdf_path = "E:\Shoban-NCI\VS_Code_WS\IAPA\sample_invoice.pdf"  # Replace with actual invoice file path
text = extract_text_from_invoice(pdf_path)
# Parse the extracted text
invoice_data = parse_invoice_text(text)
print("\nParsed Invoice Data:")
print(invoice_data)


Parsed Invoice Data:
{'Vendor': 'XYZ Traders Inc.', 'Invoice No': 'INV123456', 'Date': '03/29/2024', 'Items': [{'Item': 'Mouse', 'Qty': 2, 'Price': 25.0, 'Total': 50.0}, {'Item': 'Keyboard', 'Qty': 1, 'Price': 45.0, 'Total': 45.0}], 'Total Amount': 1295.0}
