In [1]:
#llm.py
import os 
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Load variables from a local .env file (if present) into the process environment.
# load_dotenv() already exports the values it finds, so nothing has to be copied
# into os.environ by hand; the earlier self-assignment raised TypeError whenever
# the key was missing and hid the real cause behind a generic message.
try:
    load_dotenv()
except Exception as e:
    print(f"Error loading environment variables: {e}")
else:
    if os.getenv("OPENAI_API_KEY"):
        print("Environment variables loaded successfully.")
    else:
        # Name the actual problem so the failure below is easy to diagnose.
        print("Error loading environment variables: OPENAI_API_KEY is not set.")

# Chat model shared by the rest of the notebook (imported elsewhere as `model`).
model = ChatOpenAI(model="gpt-4o-mini", temperature=0.8)

Environment variables loaded successfully.


In [81]:
#extraction.py
import base64
import os
import requests
import logging
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.html import partition_html
from unstructured.partition.md import partition_md
from unstructured.chunking.title import chunk_by_title


class ExtractData:
    """Extract tables, images and text chunks from PDF, HTML and Markdown files.

    Every ``extract_data_from_*`` method returns a ``(tables, images, texts)``
    tuple of lists:

    * ``tables`` - HTML renderings of table elements (``metadata.text_as_html``)
    * ``images`` - base64-encoded image payloads (empty/missing ones removed)
    * ``texts``  - the ``text`` of every ``CompositeElement`` chunk

    If anything fails, the error is printed and three empty lists are returned.
    """

    def _collect_from_chunks(self, chunks, image_attr):
        """Walk title-based chunks and gather their tables, images and text.

        Args:
            chunks: Iterable of chunk objects as produced by ``chunk_by_title``.
            image_attr: Name of the metadata attribute that carries the image
                (``"image_base64"`` for PDFs, ``"image_url"`` for HTML/Markdown).

        Returns:
            tuple[list, list, list]: ``(tables, images, texts)``.
        """
        tables, images, texts = [], [], []
        for chunk in chunks:
            # Elements are recognised by their class name; only composite
            # chunks are inspected, anything else is skipped.
            if "CompositeElement" not in str(type(chunk)):
                continue
            texts.append(chunk.text)
            for el in chunk.metadata.orig_elements:
                el_type = str(type(el))
                if "Image" in el_type:
                    images.append(getattr(el.metadata, image_attr))
                elif "Table" in el_type:
                    tables.append(el.metadata.text_as_html)
        return tables, images, texts

    def extract_data_from_pdf(self, pdf_path):
        """Partition a PDF with the ``hi_res`` strategy and collect its content.

        Images are requested inline as base64 payloads and table structure is
        inferred so tables are available as HTML.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            tuple[list, list, list]: ``(tables, images, texts)``; three empty
            lists if extraction fails.
        """
        try:
            elements = partition_pdf(
                filename=pdf_path,
                strategy="hi_res",
                extract_images_in_pdf=True,
                extract_image_block_types=["Image"],
                # Payload mode returns images as base64 instead of writing
                # them to an output directory.
                extract_image_block_to_payload=True,
                infer_table_structure=True,
                languages=["eng"],
            )
            chunks = chunk_by_title(elements, max_characters=4000, overlap=200)
            tables, images, texts = self._collect_from_chunks(chunks, "image_base64")
            # Same clean-up as the HTML/Markdown paths: drop missing payloads.
            images = [img for img in images if img not in ["", None]]
            return tables, images, texts
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return [], [], []

    def extract_data_from_html(self, html_path):
        """Partition an HTML file and collect its tables, images and text.

        Image elements only carry a URL, so every image is downloaded and
        base64-encoded; images that cannot be fetched are dropped.

        Args:
            html_path: Path of the HTML file to read.

        Returns:
            tuple[list, list, list]: ``(tables, images, texts)``; three empty
            lists if extraction fails.
        """
        try:
            elements = partition_html(
                filename=html_path,
                extract_image_block_to_payload=True,
                extract_image_block_types=["Image"],
                infer_table_structure=True,
            )
            chunks = chunk_by_title(elements, max_characters=4000, overlap=200)
            tables, image_urls, texts = self._collect_from_chunks(chunks, "image_url")
            images = self.urls_to_base64(image_urls)
            images = [img for img in images if img not in ["", None]]
            return tables, images, texts
        except Exception as e:
            print(f"Error in Extraction of HTML : {str(e)}")
            return [], [], []

    def extract_data_from_md(self, md_path):
        """Partition a Markdown file and collect its tables, images and text.

        Args:
            md_path: Path of the Markdown file to read.

        Returns:
            tuple[list, list, list]: ``(tables, images, texts)``; three empty
            lists if extraction fails.
        """
        try:
            elements = partition_md(filename=md_path)
            chunks = chunk_by_title(elements, max_characters=4000, overlap=200)
            # Iterate the local ``chunks``; the previous code read an undefined
            # ``md_chunks`` and therefore depended on a leftover global.
            tables, image_urls, texts = self._collect_from_chunks(chunks, "image_url")
            images = self.urls_to_base64(image_urls)
            images = [img for img in images if img not in ["", None]]
            return tables, images, texts
        except Exception as e:
            print(f"Error in Extraction of MD : {str(e)}")
            return [], [], []

    def urls_to_base64(self, urls, timeout=10):
        """Fetch images from a list of URLs and convert them to base64.

        Args:
            urls: Iterable of image URLs.
            timeout: Seconds to wait for each request so one unresponsive host
                cannot block the whole extraction.

        Returns:
            list[str]: One entry per input URL, in order. A URL that cannot be
            downloaded yields ``""`` so the result stays aligned with ``urls``.
        """
        results = []
        for url in urls:
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                results.append(base64.b64encode(response.content).decode("utf-8"))
            except Exception as e:
                # Best effort: record the failure and keep going.
                logging.getLogger(__name__).warning("Could not fetch image %s: %s", url, e)
                results.append("")
        return results
    
    

In [82]:
from src.llm import model
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser

class SummarizeData:
    """Generate LLM summaries for extracted tables and images."""

    def create_summaries_of_tables(self, tables):
        """Summarize each table chunk with the shared chat model.

        Args:
            tables: List of table chunks (each is substituted into the prompt
                as ``{element}``).

        Returns:
            One summary string per input table, as returned by the chain's
            ``batch`` call.
        """
        table_instructions = """You are an assistant tasked with summarizing tables. \
        Give a concise summary of the table. Table chunk: {element} """
        table_prompt = ChatPromptTemplate.from_template(table_instructions)

        # Pipeline: expose the raw input as "element" -> prompt -> model -> plain string.
        pipeline = {"element": lambda item: item} | table_prompt | model | StrOutputParser()

        # At most five requests are in flight at once.
        return pipeline.batch(tables, {"max_concurrency": 5})

    def create_summaries_of_images(self, images):
        """Summarize each base64 image with the shared chat model.

        Args:
            images: List of base64 strings; each is embedded into a
                ``data:image/jpeg;base64,...`` URL inside the user message.

        Returns:
            One summary string per input image, as returned by the chain's
            ``batch`` call.
        """
        image_instructions = """You are an assistant tasked with summarizing images for retrieval.
                Remember these images could potentially contain graphs, charts or 
                tables also.
                These summaries will be embedded and used to retrieve the raw image 
                for question answering.
                Give a detailed summary of the image that is well optimized for 
                retrieval.
                Do not add additional words like Summary: etc.
             """
        # A single user message made of the instructions plus the image itself.
        text_part = {"type": "text", "text": image_instructions}
        image_part = {
            "type": "image_url",
            "image_url": {"url": "data:image/jpeg;base64,{image}"},
        }
        image_prompt = ChatPromptTemplate.from_messages([("user", [text_part, image_part])])

        pipeline = image_prompt | model | StrOutputParser()
        return pipeline.batch(images)

In [4]:
# Run the PDF extraction on the sample document; the call returns table HTML,
# base64 images and text chunks, in that order.
pdf_tables, pdf_images, pdf_texts = ExtractData().extract_data_from_pdf(
    pdf_path="data/aadhar.pdf"
)

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [6]:
# Summaries (not the raw tables/images) are what get embedded and indexed later.
pdf_table_summaries = SummarizeData().create_summaries_of_tables(tables=pdf_tables)
pdf_image_summaries = SummarizeData().create_summaries_of_images(images=pdf_images)

In [11]:
from langchain_core.documents import Document
from uuid import uuid4

# Wrap each piece of PDF-derived content in a Document whose "source" metadata
# records whether it is raw text, a table summary or an image summary.
pdf_text_docs = [
    Document(page_content=chunk_text, metadata={"source": "pdf"})
    for chunk_text in pdf_texts
]
pdf_table_docs = [
    Document(page_content=table_summary, metadata={"source": "pdf_table"})
    for table_summary in pdf_table_summaries
]
pdf_image_docs = [
    Document(page_content=image_summary, metadata={"source": "pdf_image"})
    for image_summary in pdf_image_summaries
]

# One random UUID string per document, used as its id in the vector store.
pdf_text_uuids = [str(uuid4()) for _ in pdf_text_docs]
pdf_table_uuids = [str(uuid4()) for _ in pdf_table_docs]
pdf_image_uuids = [str(uuid4()) for _ in pdf_image_docs]

In [2]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorize documents and queries.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Probe the embedding size once and reuse it, instead of issuing a second
# embed_query request just to size the index.
embedding_dim = len(embeddings.embed_query("hello world"))
print(embedding_dim)

# Flat (exact) L2 index whose dimensionality matches the embedding model.
index = faiss.IndexFlatL2(embedding_dim)
print(index)

# The store must actually be created here: later cells call
# vector_store.add_documents(...) and vector_store.as_retriever(...), which
# fail with NameError on a fresh kernel while this block is commented out.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

3072
<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001E97D3BFA80> >


In [48]:
# Index the three document groups under their pre-generated ids.
# The last call is left as the final expression so the cell displays its returned ids.
vector_store.add_documents(pdf_text_docs, ids=pdf_text_uuids)
vector_store.add_documents(pdf_table_docs, ids=pdf_table_uuids)
vector_store.add_documents(pdf_image_docs, ids=pdf_image_uuids)

['7a14b431-6477-457a-a0ff-19e8ef28757e',
 '910fec11-f5c2-4449-afbd-78fbdae9d8ec',
 '60335913-394c-4645-9310-ec4e0f67b4a1',
 '8df2d77b-b130-4435-ab9d-13505eb1d7d0',
 '7c8cd97b-d9a9-4701-bd52-95f068ed8c41',
 '15b46766-a0c7-4c42-bd35-1bbcb7642f67']

In [50]:
# MMR retriever that returns the two best documents for a query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 2},
    search_type="mmr",
)
# Smoke-test query; the cell displays the retrieved documents.
retriever.invoke("Onboarding Process")

[Document(id='a7cbd2fb-7325-4a07-8dc7-9e756d54a07f', metadata={'source': 'pdf'}, page_content="ON-BOARDING PROCESS\n\nSub AUAs are agencies that use Aadhaar authentication to enable its services\n\n‚Ä¢‚Ä¢Provide the Sub AUA the necessary credentials and the URL on which xml is to be posted.\n\nAttendance SERVAM (NPR) ePDS (States) Scholarship Swachh Bharat Mission MSME Indian Army e-Panchayat (AP) CBSE, NET Pradhan Mantri Awas Yojna\n\nBiometric Aadhaar Authentication\n\nMinistry/ Departments / Organisations\n\nsecurity, logging etc.\n\n‚Ä¢‚Ä¢Sub AUA does the testing on Pre-production environment.\n\nsystem to authenticate residents using\n\nwho are willing to use AUA platform of\n\nbiometric attributes (Fingerprint, Iris).\n\nNIC\n\nfor Aadhaar Authentications\n\nUIDAI has made the Authentication\n\n‚Ä¢‚Ä¢After successful Authentication of the Application, An MOU is signed between AUA and Sub-AUA users\n\nE-Governance applications can use OTP\n\nServices shall act as Sub AUA for their

In [77]:
from unstructured.partition.html import partition_html
from unstructured.chunking.title import chunk_by_title

# Exploratory run of the HTML pipeline on the saved UPI article.
# Image URLs are collected as-is here (nothing is downloaded).
html_elements = partition_html(
    filename="data/upi.html",
    extract_image_block_to_payload=True,
    extract_image_block_types=["Image"],
    infer_table_structure=True,
)
html_chunks = chunk_by_title(html_elements, max_characters=4000, overlap=200)

html_tables, html_images, html_texts = [], [], []
for chunk in html_chunks:
    # Only composite chunks are inspected; other chunk types are skipped.
    if "CompositeElement" not in str(type(chunk)):
        continue
    html_texts.append(chunk.text)
    chunk_els = chunk.metadata.orig_elements
    for el in chunk_els:
        if "Image" in str(type(el)):
            html_images.append(el.metadata.image_url)
        elif "Table" in str(type(el)):
            html_tables.append(el.metadata.text_as_html)

print(html_tables)
print(html_images)
print(html_texts)


['<table><tr><td>UPI App/PSPs</td><td>Sponsor Banks</td><td>Handles</td></tr><tr><td>Google Pay</td><td>Axis</td><td>@okaxis</td></tr><tr><td>ICICI</td><td>@okicici</td></tr><tr><td>HDFC</td><td>@okhdfcbank</td></tr><tr><td>SBI</td><td>@oksbi</td></tr><tr><td>Phonepe</td><td>Yes</td><td>@ybl</td></tr><tr><td>ICICI</td><td>@ibl</td></tr><tr><td>Axis</td><td>@axl</td></tr><tr><td>Amazon Pay</td><td>Axis</td><td>@apl</td></tr><tr><td>WhatsApp Payments</td><td>ICICI Bank</td><td>@okicici</td></tr><tr><td>Airtel Payments Bank</td><td>Kotak Mahindra Bank</td><td>@Kotak</td></tr><tr><td>Paytm</td><td>ICICI Bank</td><td>@okicici</td></tr><tr><td>BHIM (Bharat Interface for Money)</td><td>National Payments Corporation of India (NPCI)</td><td>@upi</td></tr></table>']
['https://razorpay.com/blog-content/uploads/2020/06/Logo.svg', 'https://d6xcmfyh68wv8.cloudfront.net/blog-content/uploads/2024/02/upi-image-1024x536.webp', 'https://razorpay.com/blog-content/uploads/2021/01/upi-intent-payment.png', '

In [74]:
# Same HTML file, this time through ExtractData, which also downloads the
# images and returns them base64-encoded.
html_tables, html_images, html_texts = ExtractData().extract_data_from_html(
    html_path="data/upi.html"
)
print(html_tables)
# Show only the first ten characters of each payload to keep the output short.
for img in html_images:
    print(img[:10])
print(html_texts)

['<table><tr><td>UPI App/PSPs</td><td>Sponsor Banks</td><td>Handles</td></tr><tr><td>Google Pay</td><td>Axis</td><td>@okaxis</td></tr><tr><td>ICICI</td><td>@okicici</td></tr><tr><td>HDFC</td><td>@okhdfcbank</td></tr><tr><td>SBI</td><td>@oksbi</td></tr><tr><td>Phonepe</td><td>Yes</td><td>@ybl</td></tr><tr><td>ICICI</td><td>@ibl</td></tr><tr><td>Axis</td><td>@axl</td></tr><tr><td>Amazon Pay</td><td>Axis</td><td>@apl</td></tr><tr><td>WhatsApp Payments</td><td>ICICI Bank</td><td>@okicici</td></tr><tr><td>Airtel Payments Bank</td><td>Kotak Mahindra Bank</td><td>@Kotak</td></tr><tr><td>Paytm</td><td>ICICI Bank</td><td>@okicici</td></tr><tr><td>BHIM (Bharat Interface for Money)</td><td>National Payments Corporation of India (NPCI)</td><td>@upi</td></tr></table>']
PHN2ZyB3aW
UklGRgJvAA
iVBORw0KGg
iVBORw0KGg
iVBORw0KGg
iVBORw0KGg
iVBORw0KGg
iVBORw0KGg
['Razorpay Blog\n\nIn Payments\n\nWhat is UPI? Unified Payments Interface Features and How UPI Works\n\nMarch 24, 2025 21 Mins Read\n\nHow UPI Wo

In [80]:
from unstructured.partition.md import partition_md
from unstructured.chunking.title import chunk_by_title

# Exploratory run of the Markdown pipeline on the sample README.
# Image URLs are collected as-is here (nothing is downloaded).
md_elements = partition_md(filename="data/README.md")
md_chunks = chunk_by_title(md_elements, max_characters=4000, overlap=200)

md_html, md_texts, md_images = [], [], []
for chunk in md_chunks:
    # Only composite chunks are inspected; other chunk types are skipped.
    if "CompositeElement" not in str(type(chunk)):
        continue
    md_texts.append(chunk.text)
    chunk_els = chunk.metadata.orig_elements
    for el in chunk_els:
        if "Image" in str(type(el)):
            md_images.append(el.metadata.image_url)
        elif "Table" in str(type(el)):
            md_html.append(el.metadata.text_as_html)

print(md_html)
print(md_images)
print(md_texts)

['<table><tr><td>Component</td><td>Technology Used</td><td>Version</td><td>Notes</td></tr><tr><td>Backend</td><td>Python (FastAPI)</td><td>3.12</td><td>REST API framework</td></tr><tr><td>Frontend</td><td>React.js</td><td>18.2</td><td>Responsive UI</td></tr><tr><td>Database</td><td>PostgreSQL</td><td>16</td><td>For structured transaction data</td></tr><tr><td>Cache</td><td>Redis</td><td>7</td><td>Session management &amp; rate limiting</td></tr><tr><td>Message Queue</td><td>RabbitMQ</td><td>3.13</td><td>Async event processing</td></tr><tr><td>Containerization</td><td>Docker</td><td>25.0</td><td>For deployment portability</td></tr><tr><td>Orchestration</td><td>Kubernetes</td><td>1.30</td><td>Cluster management</td></tr><tr><td>Monitoring</td><td>Prometheus + Grafana</td><td>Latest</td><td>Metrics and dashboards</td></tr></table>']
['https://fastapi.tiangolo.com/img/logo-margin/logo-teal.png', 'https://github.com/fastapi/fastapi/actions/workflows/test.yml/badge.svg?event=push&branch=maste