Docling RAG with Langchain on Colab

Partha Pratim Ray, https://github.com/ParthaPRay

Reference: https://ds4sd.github.io/docling/examples/rag_langchain/

In [1]:
# requirements for this example:
%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.1/113.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.7/90.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.9/65.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.4/22.4 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# For local machine only
# import os

# from dotenv import load_dotenv

# load_dotenv()


False

In [2]:
# For colab only

# Firstly, Save "HF_TOKEN" HuggingFace TOKEN into Colab Secrets

# Then with Notebook Access

from google.colab import userdata
userdata.get('HF_TOKEN')

'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'

Loader and Splitter

In [3]:
from typing import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from docling.document_converter import DocumentConverter

class DoclingPDFLoader(BaseLoader):

    def __init__(self, file_path: str | list[str]) -> None:
        self._file_paths = file_path if isinstance(file_path, list) else [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document

            #text = dl_doc.export_to_markdown() # Markdown

            text = dl_doc.export_to_markdown(strict_text=True)  # Text

            #text = dl_doc.export_to_document_tokens() # Doctags

            ############ JSON
            #import json
            #text = json.dumps(dl_doc.export_to_dict()) # JSON

            ########### YAML
            #import yaml
            #text = yaml.safe_dump(dl_doc.document.export_to_dict())  #YAML

            yield LCDocument(page_content=text)

Document Path

Single or Multiple Documents

In [4]:
FILE_PATH = ["https://arxiv.org/pdf/2408.09869","https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf"]  # Docling Technical Report

Text Splitter

CHange Chunk suze and chunk overlap

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = DoclingPDFLoader(file_path=FILE_PATH)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

Text Splitting

In [6]:
docs = loader.load()
splits = text_splitter.split_documents(docs)

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


Embeddings from HuggingFace Models

In [7]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

HF_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

Vector Store Milvus from Langchain

In [10]:
import os

from tempfile import TemporaryDirectory

from langchain_milvus import Milvus

MILVUS_URI = os.environ.get(
    "MILVUS_URI", f"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db"
)

vectorstore = Milvus.from_documents(
    splits,
    embeddings,
    connection_args={"uri": MILVUS_URI},
    drop_old=True,
)

LLM from HuggingFace Models

In [11]:
from langchain_huggingface import HuggingFaceEndpoint

### Use Locally
# HF_API_KEY = os.environ.get("HF_TOKEN")

##### For colab only
from google.colab import userdata
HF_API_KEY=userdata.get('HF_TOKEN')
#######

HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
)

RAG Implementation

In [12]:
from typing import Iterable

from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs: Iterable[LCDocument]):
    return "\n\n".join(doc.page_content for doc in docs)


retriever = vectorstore.as_retriever()

prompt = PromptTemplate.from_template(
    "Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {question}\nAnswer:\n"
)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Question to RAG Based on the Document

In [13]:
rag_chain.invoke("Does Docling implements a linear pipeline of operations?") # Docling paper first pdf

'Yes, Docling implements a linear pipeline of operations, where each operation executes sequentially on a given document. The pipeline consists of the following stages: document parsing, standard model pipeline, and output assembly. Each stage performs specific tasks on the document and passes the result to the next stage. The standard model pipeline can be customized by sub-classing from an abstract base class or cloning the default model pipeline, allowing for extension of the capabilities of Docling.'

In [14]:
rag_chain.invoke("How many pages were human annotated for DocLayNet?") #Docling paper second pdf

'The number of pages that were human annotated for DocLayNet is not explicitly stated in the provided context, but it can be inferred that the dataset contains 80863 unique document pages. Among these, there are 7059 pages with two instances of human annotations, and 1591 pages with three. This amounts to a total of 91104 annotation instances, which implies that a fraction of the pages have been human annotated. However, the exact number of pages that have been human annotated cannot be determined with certainty from the provided context.'

---