In [1]:
from langchain_docling import DoclingLoader

FILE_PATH = "tax-invoice.jpg"

loader = DoclingLoader(file_path=FILE_PATH)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
docs = loader.load()



In [None]:
from pathlib import Path
from tempfile import mkdtemp
import os

from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_docling.loader import ExportType

from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer


from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM

load_dotenv()
        
HF_TOKEN = os.getenv("HF_TOKEN")
FILE_PATH = ["tax-invoice.jpg"]  # Docling Technical Report
# EMBED_MODEL_ID = OllamaEmbeddings(model="nomic-embed-text")
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"

GEN_MODEL_ID = "mistral"
EXPORT_TYPE = ExportType.DOC_CHUNKS
QUESTION = "What is the Invoice NO.?"
PROMPT = PromptTemplate.from_template(
    "Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {input}\nAnswer:\n",
)
TOP_K = 3
# MILVUS_URI = str(Path(mkdtemp()) / "docling.db")

In [49]:
from docling.chunking import HybridChunker
from langchain_docling import DoclingLoader

loader = DoclingLoader(
    file_path=FILE_PATH,
    export_type=EXPORT_TYPE,
    chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),
)

docs = loader.load()



In [50]:
if EXPORT_TYPE == ExportType.DOC_CHUNKS:
    splits = docs
elif EXPORT_TYPE == ExportType.MARKDOWN:
    from langchain_text_splitters import MarkdownHeaderTextSplitter

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header_1"),
            ("##", "Header_2"),
            ("###", "Header_3"),
        ],
    )
    splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]
else:
    raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")

In [51]:
for d in splits:
    print(f"- {d.page_content=}")
# print("...")

- d.page_content="Tax Invoice\nMax Enterprises\nInvoice No.\nDated\n261,6th Cross Jayanagar,4th Block Bengaluru\n4-Apr-20\nDelivery Note\nModelTerms of Payment\nGSTINIUIN: 29AAACP7879DIZ\nReference No. & Date\nOther References\nState Name\nKarnataka Code\n29\nE-Mail\nsupport@maxenterprises com\nBuyer's Order No.\nDated\nBuyer (Bill to)\nAce Electronics 345,7th Cross Koramangala Benguuru GSTINUIN State Name\nDispatch Doc No.\nDelivery Note Date\nRWIOO1\nDispatched through\nDestination\nRoad\nBengaluru_\n29AAACE7858FIZC\nBill of Lading/LR-RR No.\nMotor Vehicle No\nKarnataka\n29\nKA 51 EA 5451\nTerms of Delivery\n, Description of Goods = Dell 17 inch Monitor. , HSNISAC = 8471. , Quantity = 5 Nos. , Rate = 8,900.00. , per = Nos. , Amount = 44,500.00. , Description of Goods = Total. , HSNISAC = . , Quantity = 5 Nos. , Rate = . , per = . , Amount = { 52,510.00\nAmount Chargeable (in words)"
- d.page_content='INR Two Thousand Five Hundred Ten Only Fifty\n8471, Taxable Value. = 44,500.00. 8471

In [52]:
import json
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter


embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL_ID)
vectorstore = FAISS.from_texts(
    [chunk.page_content for chunk in docs],
    embeddings,
)


In [53]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_huggingface import HuggingFaceEndpoint

retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
# llm = HuggingFaceEndpoint(
#     repo_id=GEN_MODEL_ID,
#     huggingfacehub_api_token=HF_TOKEN,
#     task="text-generation",

llm = OllamaLLM(model=GEN_MODEL_ID, temperature=0.5)


In [54]:
def clip_text(text, threshold=100):
    return f"{text[:threshold]}..." if len(text) > threshold else text

In [55]:
question_answer_chain = create_stuff_documents_chain(llm, PROMPT)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
resp_dict = rag_chain.invoke({"input": QUESTION})

clipped_answer = clip_text(resp_dict["answer"], threshold=350)
print(f"Question:\n{resp_dict['input']}\n\nAnswer:\n{clipped_answer}")
for i, doc in enumerate(resp_dict["context"]):
    print()
    print(f"Source {i+1}:")
    print(f"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}")
    for key in doc.metadata:
        if key != "pk":
            val = doc.metadata.get(key)
            clipped_val = clip_text(val) if isinstance(val, str) else val
            print(f"  {key}: {clipped_val}")

Question:
What is the Invoice NO.?

Answer:
 The Invoice No. is not explicitly mentioned in the provided context. However, it's common for invoice numbers to be included in the header or footer of an invoice document. To find the exact invoice number, you should refer to the original document or confirm with the sender if possible.

Source 1:
  text: "Tax Amount (in words) INR Eight Thousand Ten Only\nDeclaration\nfor Max Enterprises\nWe declare that this invoice shows the actual price of the goods described and that all particulars are true and correct\nAuthorised Signatory"

Source 2:
  text: "Tax Invoice\nMax Enterprises\nInvoice No.\nDated\n261,6th Cross Jayanagar,4th Block Bengaluru\n4-Apr-20\nDelivery Note\nModelTerms of Payment\nGSTINIUIN: 29AAACP7879DIZ\nReference No. & Date\nOther References\nState Name\nKarnataka Code\n29\nE-Mail\nsupport@maxenterprises com\nBuyer's Order No.\nDated\nBuyer (Bill to)\nAce Electronics 345,7th Cross Koramangala Benguu..."

Source 3:
  text: "INR