In [6]:
import os 
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

try: 
    load_dotenv()
    os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
    print("Environment variables loaded successfully.")
except Exception as e:
    print(f"Error loading environment variables: {e}")
    
model = ChatOpenAI(model="gpt-4o-mini", temperature=0.8)

Environment variables loaded successfully.


In [23]:
from pypdf import PdfReader
from unstructured.partition.pdf import partition_pdf
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from unstructured.partition.html import partition_html
from unstructured.partition.md import partition_md


class ExtractData:
    
    def extract_text_from_pdf(self, pdf_path):
        try:
            elements = partition_pdf(
            filename=pdf_path,                  # mandatory
            strategy="hi_res",                                     # mandatory to use ``hi_res`` strategy
            extract_images_in_pdf=True,                            # mandatory to set as ``True``
            extract_image_block_types=["Image", "Table"],          # optional
            extract_image_block_to_payload=True,    
            infer_table_structure=True,# optional
            extract_image_block_output_dir="images/",  # optional - only works when ``extract_image_block_to_payload=False``
            languages=["eng"],                           # optional
            )

            tables, images, texts = self.parse_elements(elements)
            tables_summary = self.create_summaries_of_tables(tables)
            texts_summary = self.create_summaries_of_texts(texts)
            images_summary = self.create_summaries_of_images(images)
            return tables_summary, images_summary, texts_summary

        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return [], [], []
    
    def extract_text_from_html(self, html_path):
        try:
            # From filename
            elements = partition_html(filename=html_path)

            # From file object
            with open(html_path, "r") as f:
                elements = partition_html(file=f)

            # From text string
            with open(html_path, "r") as f:
                text = f.read()
            elements = partition_html(text=text)

            tables, images, texts = self.parse_elements(elements)
        
        except Exception as e:
            print(f"Error in Extraction of HTML : {str(e)}")
            return [], [], []
                
        
    
    def extract_text_from_markdown(self, md_path):
        try:
            # From filename
            elements = partition_md(filename=md_path)

            # # From file object
            with open(md_path, "r") as f:
                elements = partition_md(file=f)

            # # From text string
            with open(md_path, "r") as f:
                text = f.read()
            elements = partition_md(text=text)
            tables, images, texts = self.parse_elements(elements)
            return tables, images, texts
        except Exception as e:
            print(f"Error in Extraction of Markdown : {str(e)}")
            return [], [], []
    
    def parse_elements(self, elements):
        try:
            tables = []
            images = []
            texts = []

            # Save image and table elements
            for i, element in enumerate(elements):
                # print(f"{i}. Type: {element.category}")
                
                if element.category == "Image":
                    #print(element.metadata.image_base64)
                    images.append(element.metadata.image_base64)
                
                elif element.category == "Table":
                    #print("Table HTML:")
                    tables.append(element.metadata.text_as_html)
                else:
                    texts.append(element.text)
            
            return tables, images, texts
        
        except Exception as e:
            print(f"Error Parsing Elements: {e}")
            return [], [], []
        
    
    def create_summaries_of_tables(self, tables):
        # Prompt
        prompt_text = """You are an assistant tasked with summarizing tables. \
        Give a concise summary of the table. Table chunk: {element} """
        prompt = ChatPromptTemplate.from_template(prompt_text)

        # Summary chain
        summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
        return table_summaries

    def create_summaries_of_texts(self, texts):
        # Prompt
        prompt_text = """You are an assistant tasked with summarizing texts. \
        Give a concise summary of the text. Text chunk: {element} """
        prompt = ChatPromptTemplate.from_template(prompt_text)

        # Summary chain
        summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
        return text_summaries

    def create_summaries_of_images(self, images):
        prompt_template = """You are an assistant tasked with summarizing images for retrieval.
                Remember these images could potentially contain graphs, charts or 
                tables also.
                These summaries will be embedded and used to retrieve the raw image 
                for question answering.
                Give a detailed summary of the image that is well optimized for 
                retrieval.
                Do not add additional words like Summary: etc.
             """
        messages = [
            (
                "user",
                [
                    {"type": "text", "text": prompt_template},
                    {
                        "type": "image_url",
                        "image_url": {"url": "data:image/jpeg;base64,{image}"},
                    },
                ],
            )
        ]

        prompt = ChatPromptTemplate.from_messages(messages)

        chain = prompt | model | StrOutputParser()

        image_summaries = chain.batch(images)
        return image_summaries

In [24]:
tables, images, text = ExtractData().extract_text_from_pdf("data/aadhar.pdf")

In [None]:
print(len(tables))

In [None]:
print(len(images))

In [None]:
print(text)

In [None]:
print(ExtractData().create_summaries_of_tables(tables))

In [None]:
gen_image_summaries = ExtractData().create_summaries_of_images(images)

In [None]:
for i in gen_image_summaries:
    print(i)

In [None]:
from unstructured.partition.html import partition_html

# From filename
elements = partition_html(filename="data/upi.html")

# From file object
with open("data/upi.html", "r") as f:
    elements = partition_html(file=f)

# From text string
with open("data/upi.html", "r") as f:
    text = f.read()
elements = partition_html(text=text)

html_tables = []
html_images = []
html_texts = []

# Save image and table elements
for i, element in enumerate(elements):
    # print(f"{i}. Type: {element.category}")
    
    if element.category == "Image":
        # print(element.metadata.to_dict())
        html_images.append(element.metadata.image_url)

    elif element.category == "Table":
        #print("Table HTML:")
        html_tables.append(element.metadata.text_as_html)
    else:
        html_texts.append(element.text)
        
print(html_tables)
print(html_texts)
print(html_images)



In [None]:
from unstructured.partition.md import partition_md

# From filename
elements = partition_md(filename="data/README.md")

# From file object
with open("data/README.md", "r") as f:
    elements = partition_md(file=f)

# From text string
with open("data/README.md", "r") as f:
    text = f.read()
elements = partition_md(text=text)
print(elements)
#pip install "unstructured[md]"

In [None]:
md_tables, md_images, md_texts = ExtractData().extract_text_from_markdown("data/README.md")

In [None]:
print(md_tables)

In [None]:
from unstructured.partition.md import partition_md

# Load and partition a Markdown file
elements = partition_md("data/README.md")

# Extract images
tb, im, tx = ExtractData().parse_elements(elements)

for img in images:
    print("Image metadata:", img.metadata.to_dict())
    print("Image text (alt text if available):", img.text)


In [None]:
import base64
import requests

def urls_to_base64(urls):
    """Fetch images from a list of URLs and convert them to base64."""
    results = {}
    for url in urls:
        try:
            response = requests.get(url)
            response.raise_for_status()
            encoded = base64.b64encode(response.content).decode("utf-8")
            results[url] = encoded
        except Exception as e:
            results[url] = ""
    return results

# Example usage
image_urls = ['https://razorpay.com/blog-content/uploads/2020/06/Logo.svg', 
              'https://d6xcmfyh68wv8.cloudfront.net/blog-content/uploads/2024/02/upi-image-1024x536.webp', 
              'https://razorpay.com/blog-content/uploads/2021/01/upi-intent-payment.png', 
              'https://d6xcmfyh68wv8.cloudfront.net/blog-content/uploads/2021/01/unnamed.png', 
              'https://d6xcmfyh68wv8.cloudfront.net/blog-content/uploads/2021/01/UPI-usage.png', 
              'https://secure.gravatar.com/avatar/47772782b201521accc04b923d55ceec?s=82&d=mm&r=g', 
              'https://d6xcmfyh68wv8.cloudfront.net/blog-content/uploads/2025/08/hugg1-770x515.png',
              'https://d6xcmfyh68wv8.cloudfront.net/blog-content/uploads/2025/08/Slow-Moto-Tours-Blog-1-770x515.png']

base64_images = urls_to_base64(image_urls)

for url, encoded in base64_images.items():
    if encoded.startswith("Error"):
        print(f"⚠️ {url}: {encoded}")
    else:
        print(f"✅ {url} -> Base64 snippet: {encoded[:100]}...")


In [5]:
tables, images, text = ExtractData().extract_text_from_pdf("data/aadhar.pdf")

NameError: name 'ExtractData' is not defined

In [None]:
# from llama_index.core import Document

# text_docs = [Document(page_content=t,metadata={"source": "pdf"},) for t in text]
# table_docs = [Document(page_content=t,metadata={"source": "pdf"},) for t in tables]
# image_docs = [Document(page_content=i,metadata={"source": "pdf"},) for i in images]



In [38]:
print(text_docs[0])

Doc ID: 4a8fd8fd-45dc-43cb-911e-fdced400f209
Text:


In [10]:
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.embeddings.openai import OpenAIEmbedding

import os

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
embed_model = OpenAIEmbedding()
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)

# also baseline splitter
base_splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)

KeyboardInterrupt: 

In [17]:
print(type(nodes[0]))

<class 'llama_index.core.schema.TextNode'>


In [33]:
print(images)

['Image features a close-up photograph of a man with a mustache and short black hair. He is wearing a white shirt and appears to have a neutral expression. The background is a plain light color, enhancing the focus on his face. The image is framed closely around the head and shoulders.', "The image features a headshot of a man with short black hair and glasses. He has a neutral expression and is wearing a light-colored shirt. The background is plain and light, enhancing the visibility of the subject's features. The image is a standard portrait style, focusing on the upper part of the individual, likely for professional or identification purposes.", 'The image depicts a network diagram for the NIC Aadhaar Authentication Services, showcasing various components and their interconnections. It includes multiple data centers and locations, specifically NIC Koramangala, CIDR UIDAI in Bangalore, DC Pune, DC Hyderabad, and DC Shastri Park. \n\nKey elements featured in the diagram are:\n\n- **NI

In [48]:
from uuid import uuid4

from langchain_core.documents import Document

text_docs = [Document(page_content=t,metadata={"source": "pdf"},) for t in text]
table_docs = [Document(page_content=t,metadata={"source": "pdf"},) for t in tables]
image_docs = [Document(page_content=i,metadata={"source": "pdf"},) for i in images]
text_uuids = [str(uuid4()) for _ in range(len(text))]
image_uuids = [str(uuid4()) for _ in range(len(images))]
table_uuids = [str(uuid4()) for _ in range(len(tables))]


In [47]:
# print(documents[0])
print(text_docs[0])

page_content='The text discusses the offerings related to electronic government (eGov) products and services, which are designed to enhance the efficiency and accessibility of government operations through digital solutions.' metadata={'source': 'pdf'}


In [49]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [50]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [51]:
vector_store.add_documents(documents=text_docs, ids=text_uuids)
vector_store.add_documents(documents=image_docs, ids=image_uuids)
vector_store.add_documents(documents=table_docs, ids=table_uuids)

['a031b5ad-e31e-482c-b490-6d947f8f7557']

In [59]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
id_key = "doc_id"
store = InMemoryStore()
retriever = MultiVectorRetriever(
    vectorstore=vector_store,
    docstore=store,
    id_key=id_key,
)
retriever.vectorstore.add_documents(documents=text_docs, ids=text_uuids)
retriever.docstore.mset(list(zip(text_uuids, text)))

ValueError: Tried to add ids that already exist: {'09de9ea7-edc7-4983-8116-219d0054c7ee', '77838f34-84a0-47ad-8261-205c9f3bf0ac', '108e1c25-c401-4147-b0e7-e7f5fe6b03c9', '48ae3f4f-779e-4c85-b0b6-25f0420b4338', '5655d0a1-1935-478a-a1b1-a3b830352a2f', 'a83a072c-7219-4c7d-bd7b-2b4f69ba78d9', '28102c1a-9dfe-43cb-8c87-e82092bda8b0', 'a7a161f8-45c8-4f78-afd8-cb584f77f62b', '744d8215-1354-4d11-884c-8524858a33fe', '41e5e237-dde6-4a11-9ad8-51ae0c05df87', '5bb129de-c9ff-4ad7-ba7c-397dceca5ce7', '6a19be29-d48d-42bb-8707-a8d341db7115', '6c6b9b4c-2e48-4551-9d40-da48c8885ebb', 'ac21e15b-f7a9-4185-92dc-e83983d7093d', 'bc29817e-ec1a-4002-9c8b-8c598ae87977', '7e33d3bc-a655-4deb-b4f5-a3f1d26e1171', '3382d92c-e6ec-43be-aec7-1a9a69c9b4d4', '76b46dfc-234a-454e-9e17-d8011ccc379f', '3458104a-17fb-4186-bfca-4e3c8dfdb2e7', '078deb2f-8e8e-4d55-bfc9-ec2abd302c26', '8e3a26b2-fb5a-457f-a2b1-d3f3b78c7969', 'a5a36cd3-dfa8-412f-b5e2-a49a310456ec', '1d1f5b43-b512-43ef-97c0-9f445f2fc864', 'a2df19a9-25af-4061-a96d-0af3bef4ddc1', 'bcc34099-aae2-47ad-b8e2-0057dab5b6fb', '7a587e35-221c-4d4c-a7be-f3fa947b485c', '883a0146-2e6d-4333-9e3a-b8e6249f88ff', 'e14a3d9e-6f4a-4f83-86d2-f6258fa95e49', 'd499d80e-579e-436c-a036-1b5386a64a87', '962bb628-557a-40e7-b6cf-be830455d370', 'f9753c7d-83a2-4100-9e88-380bb7031abc', '36390fd5-8ce5-43dc-b803-af9d15f80829', '58cbcd8c-4748-4a1b-9342-af7ad618fdd0', 'd8542734-5856-43f0-8c5a-053973c76714', 'c0b72bc7-a649-4445-945d-97c581be4531', 'a8a30536-e3ef-4069-a6ed-513c9019890d', '9293cac9-7fbc-4e1e-8199-a0f07dcb95dd', '68409073-09bc-4f2e-95a6-dc2d76b72f48', '8a8afcb6-07e8-4515-8d0a-c7b560b7b1cc', 'd377f9a0-fb01-4092-9457-bdd8df682122', '322304af-5dd6-4a93-9b7c-e929381eba52', '57ae144e-0624-4429-9975-e4c8ea650706', '25502a33-8f01-46b1-891e-fb97e818da8b', '8722fa94-b23f-4f3e-be5b-f4a47422f454', '45a5be09-79df-45ae-bc37-4f333667ed48', '8ad2ff62-8438-4ada-9d79-3a6254f72730', '81236c7c-9bf8-4bf2-a2c2-5e9e35566b71', 'd26cfc02-c1b2-462f-af85-238c7612526a', '58433983-0b9f-4c9f-806d-7b05a52689c0', '5d9c4c48-3d4e-492c-9b3f-e6a06aab490f', '5ec9698d-8018-46c6-8e03-a528fba0869f', 'ec86fb04-6492-48ad-bc2b-5fb31ee1c3e3', 'a4a7d9f6-6331-4038-9c74-ef572eb17ee4', '29061ed2-ae5c-413a-837a-9b2d14c3e8a3', 'bace77e1-9bd4-400e-bdd3-888b1d155949', '55dfeae9-5b4d-49f1-ac60-6694222e1869', '3ad15cc1-b2ac-4b99-82e0-17b0073b7b83', '0c15e57d-b543-49a0-bb99-26e88f50e53d', '8f1146b7-20cb-4a68-88ce-47a2f51c9e28', 'b2c54703-16c8-4d2b-bac2-9744ae6866b9', '32627d25-e2b5-4b2c-abec-3e808a714bfe', '0a37cfb9-49ec-4bf9-86e9-e792b18a76e9', '9726fde7-2a58-4d97-8a92-12e1bedd5cbe', '536ee00b-535e-4697-942d-0ad219eef6bf', '61624150-13fc-4556-a994-6e3cb6f3610c', '0f31ec0a-fb47-4d61-8900-4e04d16320f8', '8ab29013-3156-40a7-88c4-6f5af7919b3d', 'aabaf05b-97ae-4e79-947d-bbdda527627d', '01b5c1b6-b0a4-42c9-bf85-b40ed443a638', '4829025f-cfa3-4d78-b4cc-7281ae168005', '489efbd8-1619-47c7-8b2a-9b5770ff1c35', '0caf5d44-2d80-4f6b-8c4c-dd22c873206f', 'fec75e98-251f-405e-a1b8-7ae32e0b026b', 'd4b859ae-4958-4653-9f04-6241ca5cb768', '6b7d986c-df30-4e12-8ce9-cc43fb5a59d9', 'd07f3d5d-474f-46af-92b5-4ddf10de2222', '9777ecee-ac19-4b9d-bb43-5bfd3184bd9b', 'e7328381-dba2-4e2f-b853-2e0dd2116c3b', '960ba01c-5ee6-4704-899d-5a156c001010', '136e4d74-0d09-401d-96e5-b25e707a6a85', '1245cf71-745c-48a7-a2e1-3ec06ce31c98', '1a657fc1-21c7-4969-9b95-02fe4baa88c4', '1f5aae2e-f20c-4538-8bcb-f3aed11072ec', '0bb26bd9-f4d5-40c6-9bf1-aaeb701cfaad', '734d59c4-4728-420c-b9e4-c6ec7ef71b2a', 'c22c0766-64d8-4d97-b2d3-98feafd4624a', 'ce1dab42-d9ae-45db-b4ed-e8893f278a6d', '4ab7a31d-8340-46a2-a91b-6a73b4c12605', '0213e681-953b-4d78-a6bf-d6c7225b7343', '911c9839-1498-42ba-b436-7a8cc1c8cf85', '7df2d663-336a-4b62-9b50-8046bbe855f2', 'ed8729af-a03e-46d3-9dc4-79cae027d1bd', 'ce9e18b5-18a4-448c-bfb1-acbe920e1501', '70f69ac9-ffe7-491f-bbaa-3e7a2bd70fdd', '4c44f7c3-dae3-46a7-8f35-478de3b84200', 'b3798711-bf96-4b01-b7b1-565f337fd708', '19adb110-8144-417f-a4b7-ea2c461a60a3', '40d0bd0a-7307-489c-93cc-ef670067c5e4', '589f4ae3-3d21-465d-a92f-381d416187fd', '237f18d7-60f4-484b-adc8-740792c013eb', 'c707004d-e369-40a8-a54e-454f6c6b8d7c', '4e31c832-5d44-4b48-ad90-0d5ffbf59303', '2b38987d-c322-492f-9cba-689b6b2631a0', '23be8db3-d949-4390-9a64-831864c0d4d5', '1a71085a-db0c-4ab0-970b-6ae550cb56ff', 'f85393a8-73ea-45b4-a97f-447a73f21fdf', '2cc864db-fb8c-4af0-a3b6-9a1e50e299cf', '7f4c104a-6494-4ce0-ad47-f0d86f8ecbe7', 'c8ecd992-1cfb-4c2b-adbe-495ac99b883a', 'a44d36f1-26fd-4a17-bd24-32d30f081b08', '83147b1d-7ed8-4019-b735-b900d938096c', '9efb736d-c935-4b75-9da3-600f1b1965f7', '5f2f1d03-b9d2-4e61-9d1c-16b9ca58968d', '4c592441-46e8-4670-98c2-cee73a8448f2', 'a427a7f0-2150-4c44-8d80-e6d76a26c989', '95adb733-34c1-4c4d-9b59-8c26034c2e4d', '668b35de-4766-4b96-ae77-f42bf0c62577', 'b2457ee4-cb58-4cba-9fb3-5c7b14e3dae6', '89c71c02-bebb-43dd-814b-1f32f4185398', 'dbc584b7-aa91-4ba1-8de1-1132f86a68b0', '6b217185-1a14-443d-9202-cae06cf1767d', '6f77fcaa-829d-46e8-befe-bb3e6ca850cc', 'c5274427-8114-414e-8ccb-3bca224726fa', '2a8c407a-4aae-44bf-9402-b5563e06f635', '9a8453aa-fe61-4cde-bf70-8d46cc86ea9d', 'a9b24d0a-6906-40e8-b801-9d6dbf11dbfe', 'ab97f5c3-e0ac-4166-9618-30b94bd158fa', '4566ab43-859f-48b9-92cd-48db926ffc9f', 'c178de04-f683-4c69-87ba-af2699f0ccb5', '5451dd29-6064-4bc5-8dd4-302ed112289d', '86c81e69-ca24-4d75-9738-f4e7c677a41c', '3f1a5730-1aa6-49ae-bd37-0e5ebb91cf3d', 'bb061a34-0062-4100-be3b-f1a4382a9a0c', 'fc0e78b3-2d1b-4fdf-bb61-8de99896a364', '8bdf12e7-1dfb-4201-8f3c-47352f4f9418', 'e795055b-0345-4afa-a8eb-453fedcb5fc5', '5ac37cb3-747c-4062-9cef-c041952c48ad', '3acfd511-b618-45fa-91c3-735a71bcb688', '5cdb9cfd-5c70-4b5b-a142-d0684070084a', '704749d4-fd61-4dc0-b166-14e7f3418fba', '9317a2bf-aea4-4ec8-8333-c366e807e885', '6d8605c8-6979-4ce6-8291-9dc1887bcef5', '5e2716cd-af71-45a7-8a65-126183944a9b', '1d744b93-394d-492c-ab2a-daf141329266', '72234311-0400-4496-b031-bb8ad93bcb57', '04f38b3f-0884-435b-8d06-af58f127049f', 'c32bba2d-94bd-405c-83f3-0b4b4d91c67c', 'f417556f-da83-431b-b071-605f28f62648', '9da49f40-c99c-4834-b134-4a39ae6f254a', '417c4b24-fd55-43ef-b5c6-382a9f77dee2', 'dcac4dae-a412-4f4b-aa4b-177f30ac35d5', '43fddffc-d162-4b90-ab12-1cf689da2f09', '4a6b695c-c697-43ae-b95b-5b4594c06e78', 'a6c9dc8a-a19c-4098-8765-b43edc5d8ba2', '56efc2c5-146f-4b8b-b115-556ad0067a5d', 'e056fc26-103f-4505-a3ec-fd824ececfee', '06d1c4c1-0e7c-48ec-9575-20d4d370e85b', 'bddaf0af-2e63-4412-8655-5a2fe79b5dd3', '7684bf60-b7b1-4609-88fa-4277a0c6a724', '68a1a380-e20e-4a9a-8e2b-3e82fd3ce880', '8f303902-0eb1-49aa-b97b-d366b57cbde6', '95056527-7c5e-43d3-9f03-5a3b5af76960', 'e2ba612f-f9a2-4f24-a7cd-89774aa917f6', '7327b734-5f12-4925-8994-cc470e6b2847', '7d5f891a-a15f-4d9b-bb33-f3f2fd43ab4b', 'c07a69cf-ed62-494b-9db9-2005c8f97607', '508146d9-6f48-4cd7-9f4f-8587ecac7ffc', 'd6bb7a09-a214-4d4e-8e0a-20e77c3c65bc', 'f80e43ff-3174-4c1a-abd3-bc8f927bb641', '2c473a41-5169-49c2-9efe-387d95adba00', '94264cf1-6fa0-4ae3-94f4-9d7740fa8a97', 'baf377f1-cef5-488d-8094-108bcf0fad7f', '6fa1ed36-374f-4393-bf89-9efaf0f70dc4', '317fa646-421a-4763-8d38-245417b3575a', 'bc508b52-1195-480a-b860-840e88074148', '6e04981f-0e1b-4993-906f-6587ec927500', '92b9610c-e7a4-492c-bb32-ca86bce7fdb7', '78ae1117-ff6e-47b9-8386-e66aa0d9692e', '4edf4c93-6bdf-4b0e-9412-49d25c82dd2e', 'f04bd441-85e3-4851-ac74-488bb179bd91', 'aa31c458-7528-4a42-abe3-386b9ba36b5b', '2654316f-d9c7-48dc-b339-b278a71327df', 'b10cded8-302e-43b8-b05d-8069ece33a13', '36a31712-a6b2-4461-8002-e95dc149359a', '7794df55-fc63-4a23-ae6e-7a47e0529f9b', 'b52f6b20-19a5-4cc4-b907-dd1cc8579a5c', '26d68cc9-01f4-416d-8bed-91db13d199f5', '360f1322-8558-4acc-adcc-53a0740b29ad', '19b0ca0e-2388-4abf-bccb-e878445bf04a', '467fc450-a01f-42e0-bd25-b9137c505420', '381174a2-6d99-4c4d-9aee-bcaf71e5f43b', '6afc1d06-4509-4b4c-9d71-7656ac1bd01d', '3ff4f7a6-06ce-4220-bbd5-ed41a009b9bd', '812c9646-bbfe-4850-9a26-a6c47aae54b4', 'ae695624-9854-49e3-85ea-8e98a6a270eb', '19eb60f4-1a7c-4489-acab-920eb91f6137', 'c292b70b-1013-4146-8af2-908dc232388a', '81437b32-66ff-46ab-811b-0a846695c3a6', 'd64dd1cb-7d15-4a74-bb42-5ae2d56fcc24', 'c5707dbf-6772-438a-a6fc-8034b9cfee63', '4067aab7-8333-4c06-8fca-772fd03476f4', 'f54f4364-4ae8-4380-b41e-2d246a042305', '71e4cf9c-c5da-43ef-8fe0-a909c257aa55', '7c3ccb4b-a6e7-4bb8-8ac2-593cc5e06b16', 'a8e1602c-d8bc-4c90-b115-7064079918c7', '095aaac2-9bdc-4d88-bed0-f53fa2c608b2', 'b600cea9-1cda-4d8e-81b6-b669553ec95f', 'fcbfc753-ef69-4bbf-a0ac-5ef657387b67', '768ef5f6-2096-4c40-9ef5-f5219e21a88c', '45ac163d-ca43-462e-8d0a-327aab6dd78b', '7fee7c6d-1abc-4be4-a687-9c4a6ae5bf40', 'b0d4e9f7-d81f-4f8e-aab2-21cd16580f44', 'd9b8d193-e3a7-41bc-8d4d-aa6229cbd30a', 'e0f7995b-23f2-4b5b-a00a-318e890aa60c', '5a5c43ea-30de-404a-a660-ec11a95a7a6d', '00420b51-f455-4a94-93cd-defdd4726a7a', '228d70dc-23a1-4290-8f99-56ba73b6ab61', 'c3423e5d-760d-42ea-99cd-6f7fd5dfb965', '025e2d95-7fa9-410a-9b11-676cb291f6f6', '34285841-7854-4d22-978e-933b72d43e62', '5b3481b2-edec-401d-9c6e-c0875e77a1b2', '17994998-1eea-4600-98e9-40afe8ee1939', '21949cb9-1a21-4c5b-81ea-e4667d9327a0', 'c4571315-03f3-4a56-97fc-cb8b2e9b4dc6', '56a208fa-5d57-4d65-921d-8e8c1dbfc89a', '8adb9f7c-93a0-473d-86e5-bb29b1603119', '725ac0ef-b3aa-4702-89af-862941e74663', '43f6b577-6c0b-4a32-b2b6-bfad92e0f34c', '89f96426-5b97-4162-a1e2-fce107a1b17a', '142b24c0-0878-46a8-9493-47d798a7f47d', '3f946b1d-a026-4284-a763-02a905bb2932', '1d46ce24-595b-4c2e-8e5f-3b55000684bf', '9f33e93f-56f6-45ce-b6d5-64694bc8b5cc', '57f78f80-cbfc-47b8-9de4-cfbb95c22aae', '8e6b1000-ebc5-46a1-b22b-aa2408d60c00', '1931a85c-481f-4de9-91c6-b4348c7a58d2', 'f9884f69-5e7c-434c-a8ce-652f7dc40e94', '51dd9c1b-2ce3-43f8-b783-2725ad3d811c', '79451cf1-498c-4277-a61d-abb4fc7d7c0d', 'c3862dc1-b100-4724-8c09-01aac69e9ab9', '2cfc84ee-4d95-46c8-b2f5-b8871b4fb410', 'e95c0bd3-2bb2-4970-8165-481c38127d27', '3d67423f-6019-4f02-89f3-14c376de98e2', '7bd5198f-d872-4134-a519-e25b25977393', 'd1f6d5e6-5ff8-4b8d-9bf2-76a07a77eea9', '5184a942-f1e5-4d91-8ac7-878a4b805cfb', '737878fe-510f-45df-892d-32e8b3530b3e', '285964e1-d2cc-47f8-b1e5-707fcd444ece', '43724ab8-4dbd-4643-a8f2-38c109aa62bc', 'f9aef9d3-d9c3-46b3-b091-8d19d6228e94', 'c25b8d78-b728-4ae6-8c36-9aedf02d6c33', '6b06a299-9b75-4075-adfa-321ad9a771f1', '03355774-1331-46b3-bb63-6073773e6aa2', '0000bbef-dbee-43ef-8d28-cb90763c894b', '605c8f84-8d63-4fa9-afea-5b993e321b83', '2ce52826-d2b2-4462-ad32-a07885401150', 'c0e20277-e943-4c47-a6e9-8d71259a24ef', '5d08b105-526c-4d9c-b761-f6a748d09cfc', '9913f3b2-5b9c-4be5-86c1-930aab9b3273', 'c8545a40-6ab3-4739-b456-ffbfd809a698', 'bcb223b6-cdde-4bdb-9150-60be2369edeb', '1522a05a-388d-44a5-989e-36e3a1601b95', '9e865084-ef18-4f94-98ba-53d1eae837cb', 'efceb5fc-af2b-489b-96b5-d1718bbed68d', '9d7e877f-9813-43d4-8c02-25ba28c15579', '4b3c8d7f-4e83-441b-b788-ae7c8f7145d3', 'd4d8e7f3-7b52-4144-a893-de8ed83e1966', '6d01932f-3b68-4138-8264-7caf729d1b4f', '79459f2d-913f-4a12-9ccf-485377e3e341', 'd78ec171-7372-4e6d-b710-b6947a65673c', 'fa53ec3c-b38b-4d62-be66-8d815903127a', 'dec04ba4-7e11-4d76-be13-68ff76490b70', '409f1a3f-65e1-4995-915e-d39278ff2940', 'e893c8b0-56eb-426c-b11b-82006e44a955', '6c9c59c7-ce03-441e-83f5-8713df0a1715', '7ff70ed4-f68c-4a59-8326-a1745ad7b82a', '0bb8778a-8fed-41eb-8537-f52a04bf4b11', '77c9cfff-2702-4de1-b739-81675c5d0dd7', 'd48c162b-792f-4659-8881-a4bb9b9c47ac', '23f93144-eeeb-4125-a77b-57fce6d815c2', '45a3d27f-6732-4d4f-a917-8bf00911983d', '3a5f23f2-860a-4879-aa6d-d2d791ea31e0', '4465e9ba-433d-484a-97c7-e31d9e38f431', '08e54f95-66da-449e-a51f-f0014a9f0950', 'a431fbac-aa10-434f-b6c8-3c45ff82fd52', '63f0b4b4-dfcf-467f-8214-13da25b6c50f', '37fd901f-07c5-4222-8b3a-083836552cd1', '503b4455-ff12-4a51-85dd-2d4a5a1dc98a', 'c5d3976c-a232-456f-870d-40715c2df9de', '542debee-c1a0-4ca0-9b0d-315f1b5810f8', '610b0761-fcdb-4987-a66b-813f69de8b50', '77ded314-c577-4698-bc07-aa8e32dfc242', '85d652c8-be18-45e6-bb90-ce26586bc9ba', '73e4e3f4-e01b-4030-a44c-c69f032e3271', 'f22f8cf6-2fd4-431d-a2dc-5ad0e7153afd', '82d99c26-a066-416d-b2bd-e9908d82bd1c', '7fec4793-d64c-4e80-8754-35e6ccd5fb72', 'a7f7863d-0fc8-40ef-a1dc-ef69b4fbb0ac', 'b99d40c0-ff6b-49e3-a12b-480dc5d8946a', 'e3c9012c-4fff-4acf-a7d0-2adf7939b197', '8de9338c-92f9-4bc3-8fda-1af96b4660d2', 'dd467ccc-04ba-4c7b-8cb9-41b4a7c5008e', 'de0b1f54-693d-4918-9f54-86db4e9db357', '5f518af0-874d-441e-bcea-f981985fbb79', 'bde764a1-9d0b-4692-93a2-c2f7bbac83cf', '4e654aca-4781-4244-a71f-9f93e3d6647e', '2da177b1-23a2-4f7b-9688-fdd1761763ea', 'f8858f11-eaa9-466a-80c8-2c59c7c20d20', '1df322b1-e8c3-419c-b48d-5d6fbaa6052f', '23686a09-575d-4ca6-8f70-822d71dd8bc2', 'ea95cb3b-109d-453e-b117-7b51b057fc90', 'eb08c213-ed21-419c-93cd-2241ab8d8951', 'a4707282-055b-4760-aedc-bbb1dfc53ffa', 'e46e01f5-ad3e-48c4-a22a-147b169ae0ce', '9372cf1a-8640-4e6a-aa51-b67daf9ea906', '95172442-d579-4edb-99a2-47129f2c2a5d', '0d447472-e3c5-4ea5-a27e-4e30ac4036cc', '4f77982a-9c0f-472e-9185-bdbe94180a90', '562f158a-35ff-4a0f-a97a-b241c64ea412', 'f95ef7b3-b760-4875-b866-ffcfc7ab091a', 'c633fcc7-1318-49ae-b358-f9a83419f29c', '0bfa5aa7-d758-4b0d-aee8-2ebe3d9e47df', '29731c4a-a357-4d88-8a29-d7569f098ab0', '2dda36b8-b530-4a0c-b5c2-9fe5740c20e3', '961719da-0573-4e9c-9741-af00980df680', '6c344dd5-d3f5-4a5f-a283-37a3893df7ee', '1332a4c2-b13c-4c58-bf53-a0551f36aeb1', 'cdc51f48-d8f7-456c-ae84-34befd68db1b', 'cca5f18f-1328-4112-a53f-9f30f2ed3de3', 'c549e7ae-b156-4075-afb6-0a786d7ca253', 'f92533be-111e-4b8b-844e-3bc8aaf40163', '6afe8933-b322-4c2d-9ea3-0361ba886028', '35a0202b-60fc-49b2-b22c-45948f8e64fd', 'ddb275d1-aaf8-48b1-83f5-41d8d34740e6', 'f4208233-8723-4a20-923b-af745600b804', '94e7a04a-028d-44d5-9a2a-0bebf3e51ec4', '2da43149-d1a0-4436-8bb9-4c29c5ac3ee1', '01c9c063-b7d2-4634-9724-88dc8a53f4eb', '684aa9d4-d043-4243-97e7-b075e5be3723', '8052c3dc-8dc1-4add-b9fa-aa0f618a7845', '9bf47d8d-d60b-4627-898c-6454210e2e49', 'b07ec6ce-16ca-41a4-98b4-b0ee9e179067', '373ff5ed-9c39-4a3b-b403-6b5a7c0075c2', 'bf299531-766a-4eaa-aef2-1e1ee6b3c61b', 'c797f16a-0595-4021-901d-41beb2736dd9', '92c7c06d-36b7-4343-bd78-570a8a197ca4', '8937c72a-e92c-4d85-9119-81bdcb69b93b', 'ce9cc590-3fac-4a57-ba32-cfa62c1f6190', 'dff04fb4-3860-4c12-b9d2-ffe7130eef5b', 'd736ac65-e9b3-49b3-bf84-ff9958de6614', '49b4d387-ee2f-4b06-8a67-7ce549472c49', 'b83da262-c926-4b12-bba0-bd1b89a20cf3', '74e55651-9f7f-4de7-967a-5aa5e0a74610', '1318d38c-db6f-4de2-8de1-c939e692bc55', '529d477e-8e86-404d-add2-6cdd8f4223b6', 'e9bd1b7f-dac1-425e-be02-79064f423014', '2f4e310f-6d1f-4272-a0c3-0b28dca0f028', '3198ff61-3ae4-4a97-ab14-94db7c88e21d', 'ca687a96-6cb6-4579-a6f3-b4cb72b8cccf', '6bf914c7-2948-4bfc-9dcd-24715f68580b', '3e76a80e-6ce8-4bc3-8606-9c39078a0ffe', 'aa22fb31-0ea1-499f-8309-8b1cb7422f5e', 'bdb3bb8d-129b-4d76-95ee-814229146a1b', 'a16a8635-4495-4620-89a4-171c69a80c2c', '07803e3a-b1a4-4676-abe3-c096afc381a3', 'f3425293-dcc1-423c-8068-f980d18500c3', '0fcf8cbb-b616-44ed-bf9d-abb27da78d47', '08137f4e-c34d-46a2-b594-f0236f18c489', '13d483e6-9343-4518-9ed7-90c6d66c3ee9', '8d058268-89ea-40c6-a491-42c28c7f2335', '611c12ff-eb93-4c0c-b8cb-3284a3dd8f9e', '62513a1a-c996-4e19-bd83-01c93dd8ff83', '0c438a84-30ff-411f-91b3-2249dd9a05e5', 'e6ff15f2-19b7-49a7-924f-94cd348f9af8', 'cc5bef8c-8459-4566-a2a2-9de9b07ec8b0', 'a6bf8014-b789-4c88-a64a-2c149c66b7b8', 'cfa1dc56-f265-4731-9e43-7a5d12666760', '320589e0-65d7-4015-bf11-510325a7a9ee', '8941d7db-d757-49f1-8b00-6707cd7d0bbc', '7111a20f-e681-49eb-a439-8d8e580af83b', '8bde1edf-1f11-4422-8460-52eca32a4e93', '38f5a89f-c325-445d-a21c-22dd3f118bc4', 'a91d2887-b456-42b4-9d8a-6a9b28220705', 'd5dbe8e5-c1fa-41f3-b1a7-9a231b018c6c', '793e4b23-aac7-48ea-94b9-2cf511d40f89', '670f986b-0a1c-47f8-9085-f3ba367500b7', '7891c26a-de39-4db8-9450-1c607553ed4d', '87f693dc-f3c2-4308-8fea-2d79f8d8ca4a', '02e46c8e-9bd7-4339-ab7c-417b418f4508', '78413de2-1261-4c17-81a9-5827c108e9b9', 'e72ba24f-eae6-4cf4-a363-aaa5daa3d397', '1d27aaa5-6836-41b3-82ad-8d5bdb9a23ac', '41131cd7-9283-4b54-a042-f3fa5d398f2a', '249a1cd9-c6b9-448a-9e4d-6c3d1f6fa02b', '083f1110-936a-4bf1-8dd5-a862174f7c79', '12d9fae5-6b69-49d2-948f-11dc3ceaa685', '0b226d52-10a4-456a-8565-88decb829394', '41d48c64-685f-486b-9ab6-654f8d3d5ec8', 'fac78383-3dbd-44be-b29e-61a59431e434', '3ec1d555-ab05-437f-bf92-3c8270c04e0b', '636d63bd-89f4-4e28-83f2-aaa1401c44a1', '75f3f1eb-7189-4a2f-ab2b-744e7093d17d', 'd09ae932-c2a2-4945-a5ef-45938126c4a6', 'ed95dc2f-890c-4ca9-9b1c-1170bda2b0fe', '47851adb-11ed-40c3-8a20-f3ab216bc739', 'a577bd3b-be8f-4bed-bff2-727bee850fca', '802f10f9-7c88-42b6-9229-bbe151c90991', '5eaa2e55-c45d-4ea1-97c5-388551c31575', '86ac01f0-f1d6-43f2-9d9a-087d6e18e36c', 'da4721c6-78fe-4fe9-9381-7c6fe4ae36cf', 'b60b6d5a-a838-4cfe-a603-76c36a6c5b67', 'cbc80aa8-c0f7-4d69-bf3b-2ffa4c5736fb', '294f05d9-ae38-465c-8107-fa6fed25caad', 'cb83a6e8-bfef-4e64-8e1b-35681587c46a', '21e6fcd2-af8d-4c28-91aa-ca81fdf74b17', '32f9fee3-e0bb-4c2a-b4e5-00604459b258', '7749cc6f-fa02-4a51-b718-a9232436011a', 'f0c2b887-bf18-4cc1-83f0-5071092b6370', '673a9adb-35fe-4771-8ccb-40d0b84e8f95', 'fa6adbf6-6d88-4d2c-ab7c-5187b9877a63', '4fcb1639-2b8b-4e8d-98d5-13dd4f3f06ef', '1e0f06eb-3ea3-4bd4-90ae-8e114944c8f2', '393daa08-d1d0-4e23-b6dd-d393eed952bd', '65f08542-9482-4f44-a231-655283a71742', '0f83fcfd-9899-4dfc-b326-7ac6b46a70fe', '258ac8ec-dd55-4064-b8b0-84115508352b', '0e7c8eab-f22e-4be5-a292-42c2103c1b04', 'e7e369e6-ca14-4f53-8033-6360c8e5d512', 'e1524696-4a76-471f-b3b1-21ee1a1a0c73', '954b776b-e246-45c4-b5b2-b33b666a624a', '1d366415-2ba4-47ab-983a-bb88abe6ca49', '818056bc-2a06-424b-86f0-23b5176a968a', '16595509-93a2-43bd-9658-818d80cbba5c', '7d7fd049-5344-4a58-a762-38faddc66099', 'ee2070b4-0c2f-4712-bf66-70c797b4dd68', '01861283-3249-474d-a9d8-0586311d8d59', '3e2cc848-21b1-4a8a-9b1c-87f03813f909', '4cd0146f-f680-4c43-acfe-9612429a91b4', 'c4306840-5617-40d8-b94d-d8eeeb0e8ca7', 'aabe5ca9-cd0d-4e1f-89e6-742acebafb6d', '26c0edf3-b2e6-4798-8a07-3c5ef1681c9e', 'e56d9f1f-0922-468a-a968-953d1d60bf9d', '66936201-8514-4704-92f9-7ddd0685e44b', '5ee87afd-b49b-4494-b402-3b8afe17522b', '75d824cc-71be-454b-b70b-0c12776a12d1', '9a3313bf-10d9-438c-a2e6-8c660a6e0bb7', '9c65711c-57b6-447e-9c6c-097fe21ec367', '770ba42c-60dd-419d-8cad-6abc4f0feda8', 'e10a0eaa-4547-4794-90e8-cdbf7ce47532', '95d2c43e-50cf-4e2e-b5e3-572f1f35bb8f', 'daa908e8-cb67-4d80-a896-3481c96cd894', '314ab9c6-aec6-40e0-89de-41e3dc1e555a', '3969ba8d-60b7-4ef4-871c-8151a9085f78', '1474367a-9832-4378-9d09-75132f445c88', '45eb50dd-ebca-4210-bf9a-2215391e0a5f', '996e7527-0ad2-4735-89e9-ff2f46add0fa'}

In [60]:
results = vector_store.similarity_search(
    "on boarding process",
    k=1
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* The on-boarding process refers to the procedures and activities designed to integrate new employees into an organization. It typically includes orientation, training, and introduction to company culture and policies, aiming to ensure a smooth transition and enhance employee engagement and productivity. [{'source': 'pdf'}]


In [55]:
# for image_doc in image_docs:
#     print(image_doc.page_content)  # Print first 100 characters of each image document
image_docs[-1]

Document(metadata={'source': 'pdf'}, page_content='The image features a triangular structure with a list of various programs or initiatives organized vertically. Each program is represented with alternating colors—top sections in yellow and bottom sections in teal. The listed items from top to bottom are:\n\n1. Attendance\n2. SERVAM (NPR)\n3. ePDS (States)\n4. Scholarship\n5. Swachh Bharat Mission\n6. MSME\n7. Indian Army\n8. e-Panchayat (AP)\n9. CBSE, NET\n10. Pradhan Mantri Awas Yojna\n\nThe design emphasizes a hierarchical structure, possibly indicating a prioritization or categorization of these programs. The triangular format suggests an upward trend or importance of the listed initiatives.')

In [57]:
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 1})
retriever.invoke("ELIGIBILITY FOR GETTING AUTHENTICATION SERVICES")

[Document(id='734d59c4-4728-420c-b9e4-c6ec7ef71b2a', metadata={'source': 'pdf'}, page_content='The text discusses the criteria required for individuals or entities to qualify for authentication services.')]