In [1]:
import easyocr
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import torch
import json
import re
import faiss
import numpy as np


2025-02-23 17:57:16.258139: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-23 17:57:16.271136: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740313636.286742 3161882 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740313636.291294 3161882 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-23 17:57:16.309081: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# === Step 1: OCR - Extract Text from Invoice Image ===
# EasyOCR readers keyed by language tuple. Constructing a Reader is the
# expensive part of OCR, so each one is built once and reused across calls.
_OCR_READERS = {}


def extract_text_from_image(image_path, languages=("en",)):
    """Run OCR on an image and return the recognised text, one fragment per line.

    Args:
        image_path: Path (or any input accepted by ``easyocr.Reader.readtext``)
            of the invoice image.
        languages: Language codes handed to ``easyocr.Reader``. Defaults to
            English only, which matches the previous hard-coded behaviour.

    Returns:
        str: The text of every OCR result joined with newlines, in the order
        EasyOCR returned them. Empty string when nothing was recognised.
    """
    lang_key = tuple(languages)
    reader = _OCR_READERS.get(lang_key)
    if reader is None:
        # First request for this language set: build the reader and remember it.
        reader = _OCR_READERS[lang_key] = easyocr.Reader(list(lang_key))

    ocr_results = reader.readtext(image_path)
    # Element 1 of each result is the recognised string; the other elements
    # (position / score) are not needed downstream.
    return "\n".join(result[1] for result in ocr_results)

In [3]:
# === Step 2: Process OCR Text with Mistral-7B to Extract Invoice Details ===
# Loaded (tokenizer, model) pairs keyed by model name, so the checkpoint is
# loaded once per kernel instead of once per invoice.
_LLM_CACHE = {}


def _load_llm(model_name):
    """Return a cached ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _LLM_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float16, device_map="auto"
        )
        _LLM_CACHE[model_name] = (tokenizer, model)
    return _LLM_CACHE[model_name]


def _first_json_object(text):
    """Return the first parseable JSON object embedded in ``text``, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete value, so
            # trailing commentary after the JSON does not break parsing.
            parsed, _ = decoder.raw_decode(text, start)
            return parsed
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
    return None


def extract_invoice_details(ocr_text, max_new_tokens=2048):
    """Ask Mistral-7B-Instruct to turn raw OCR text into a dict of invoice fields.

    Args:
        ocr_text: Text extracted from the invoice image.
        max_new_tokens: Upper bound on the number of tokens generated for the
            answer (the prompt is not counted against this budget).

    Returns:
        dict | None: The first JSON object found in the model's answer, or
        ``None`` when the OCR text is blank or no JSON object could be parsed.
        A diagnostic line is printed in every ``None`` case.
    """
    if not ocr_text or not ocr_text.strip():
        # Nothing to extract from; skip the model entirely.
        print("❌ Error: OCR text is empty")
        return None

    model_name = "mistralai/Mistral-7B-Instruct-v0.3"
    tokenizer, model = _load_llm(model_name)

    prompt = f"""
    Extract key details from the following invoice text and return only valid JSON.

    <invoice>
    {ocr_text}
    </invoice>

    Return only JSON output:
    """

    # Follow the model's own placement (device_map="auto") rather than assuming CUDA.
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=4096
    ).to(model.device)

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # Set explicitly so generate() does not have to guess (and warn).
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        output = model.generate(
            **inputs,
            # max_new_tokens bounds only the answer; max_length would include
            # the prompt and can leave no room to generate for long invoices.
            max_new_tokens=max_new_tokens,
            # Greedy decoding; this is what temperature=0 was meant to express.
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
        )

    # The returned sequence starts with the prompt tokens. Decode only what was
    # generated so braces in the OCR text cannot be mistaken for the answer.
    prompt_length = inputs["input_ids"].shape[1]
    extracted_text = tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    )

    if re.search(r"\{.*\}", extracted_text, re.DOTALL) is None:
        print("❌ Error: No valid JSON found")
        return None

    extracted_data = _first_json_object(extracted_text)
    if extracted_data is None:
        print("❌ Error: Could not parse JSON")
    return extracted_data


In [4]:
# === Step 3: Initialize FAISS with Sentence Embeddings ===
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient text embedding model
# Take the vector width from the model itself so the FAISS indexes stay in sync
# if the embedding model above is swapped (all-MiniLM-L6-v2 reports 384).
d = embed_model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(d)  # L2 distance index over whole-invoice embeddings
invoice_store = {}  # row id in `index` -> invoice dict
field_index = faiss.IndexFlatL2(d)  # For field-level embeddings
field_store = {}  # row id in `field_index` -> field record


In [5]:
# === Step 4: Store Invoices in FAISS ===
def embed_and_store_invoice(invoice_json):
    """Embed an invoice and each of its top-level fields, and register them in FAISS.

    The whole invoice (serialised with ``json.dumps``) is added to ``index`` and
    recorded in ``invoice_store``. Every top-level ``key: value`` pair is added to
    ``field_index`` and recorded in ``field_store`` together with the id of the
    invoice it came from, so a field hit can be traced back to its invoice.

    Args:
        invoice_json: Dict of extracted invoice fields.

    Returns:
        int: Row id assigned to the invoice in ``index`` / ``invoice_store``.

    Raises:
        TypeError: If ``invoice_json`` is not a dict. Raised before any index is
            modified so the stores never end up half-updated.
    """
    if not isinstance(invoice_json, dict):
        raise TypeError(
            f"invoice_json must be a dict, got {type(invoice_json).__name__}"
        )

    invoice_text = json.dumps(invoice_json)
    invoice_vector = np.asarray(
        embed_model.encode(invoice_text), dtype=np.float32
    ).reshape(1, -1)
    index.add(invoice_vector)

    stored_index = index.ntotal - 1
    invoice_store[stored_index] = invoice_json  # Store dictionary (not string)

    fields = list(invoice_json.items())
    if fields:
        field_texts = [f"{key}: {value}" for key, value in fields]
        # One batched encode call instead of one model call per field.
        field_vectors = np.asarray(
            embed_model.encode(field_texts), dtype=np.float32
        ).reshape(len(field_texts), -1)

        # Rows are appended in order, so the new ids start at the current size.
        first_field_id = field_index.ntotal
        field_index.add(field_vectors)
        for offset, (key, value) in enumerate(fields):
            field_store[first_field_id + offset] = {
                "key": key,
                "value": value,
                "invoice_index": stored_index,
            }

    return stored_index

In [6]:
# === Step 5: Generalized Querying ===
def query_invoice(query_text):
    """Answer a free-text question with the value of the closest stored field.

    The question is embedded with ``embed_model`` and the single nearest row in
    ``field_index`` is looked up in ``field_store``.

    Args:
        query_text: Natural-language question.

    Returns:
        The ``"value"`` of the best-matching field record, or a fixed
        "not found" message when the returned row id is not in ``field_store``.
    """
    encoded = embed_model.encode(query_text)
    query_vector = encoded.reshape(1, -1).astype('float32')

    search_result = field_index.search(query_vector, 1)
    nearest_ids = search_result[1]
    best_id = nearest_ids[0][0]

    # Guard clause: unknown row id means there is nothing to return.
    if best_id not in field_store:
        return "❌ No relevant invoices found."

    return field_store[best_id]["value"]

In [7]:
# === Step 6: Full Processing Pipeline ===
def process_invoice(image_path):
    """Run the whole pipeline for one image: OCR, field extraction, then storage.

    Args:
        image_path: Path of the invoice image to process.

    Returns:
        The index returned by ``embed_and_store_invoice`` on success, or
        ``None`` when extraction produced nothing usable. A status line is
        printed in both cases.
    """
    raw_text = extract_text_from_image(image_path)
    parsed_fields = extract_invoice_details(raw_text)

    # Bail out early when extraction yielded nothing (None or an empty result).
    if not parsed_fields:
        print("❌ Failed to process invoice.")
        return None

    assigned_index = embed_and_store_invoice(parsed_fields)
    print(f"✅ Invoice stored successfully at index {assigned_index}!")
    return assigned_index

In [8]:
# === Step 7: Example Usage ===
# Run the OCR -> LLM -> FAISS pipeline on a sample image (relative path).
image_path = "invoice.png"
invoice_index = process_invoice(image_path)

# process_invoice returns None on failure, so only query when something was stored.
if invoice_index is not None:
    query = "What is the total amount?"
    query_result = query_invoice(query)
    print("🔹 Query Result:", query_result)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


✅ Invoice stored successfully at index 0!
🔹 Query Result: 154.06


In [9]:
# Relies on `invoice_index` set by the pipeline cell; skipped if processing failed.
# The lookup searches every stored field and returns the closest one's value.
if invoice_index is not None:
    query = "the bill belongs to whom?"
    query_result = query_invoice(query)
    print("🔹 Query Result:", query_result)

🔹 Query Result: {'name': 'John Smith', 'address': '2 Court Square', 'city': 'New York', 'state': 'NY', 'zipCode': '12210'}


In [10]:
# Relies on `invoice_index` set by the pipeline cell; skipped if processing failed.
# The lookup searches every stored field and returns the closest one's value.
if invoice_index is not None:
    query = "what is the invoice date?"
    query_result = query_invoice(query)
    print("🔹 Query Result:", query_result)

🔹 Query Result: 11/02/2019
