In [1]:
import os 
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

try: 
    load_dotenv()
    os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
    print("Environment variables loaded successfully.")
except Exception as e:
    print(f"Error loading environment variables: {e}")
    
model = ChatOpenAI(model="gpt-4o-mini", temperature=0.8)

Environment variables loaded successfully.


In [None]:
from pypdf import PdfReader
from unstructured.partition.pdf import partition_pdf
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from unstructured.partition.html import partition_html
from unstructured.partition.md import partition_md
from unstructured.chunking.title import chunk_by_title


class ExtractData:
    
    def extract_text_from_pdf(self, pdf_path):
        try:
            
            elements = partition_pdf(
            filename=pdf_path,                  # mandatory
            strategy="hi_res",
            extract_images_in_pdf=True,                            # mandatory to set as ``True``
            extract_image_block_types=["Image", "Table"],          # optional
            extract_image_block_to_payload=True,    
            infer_table_structure=True,# optional
            extract_image_block_output_dir="images/",  # optional - only works when ``extract_image_block_to_payload=False``
            languages=["eng"],                           # optional
            )
            
            tables, images, texts = self.parse_elements(elements)
            tables_summary = self.create_summaries_of_tables(tables)
            texts_summary = self.create_summaries_of_texts(texts)
            images_summary = self.create_summaries_of_images(images)
            return tables_summary, images_summary, texts_summary

        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return [], [], []
    
    def extract_text_from_html(self, html_path):
        try:
            # From filename
            elements = partition_html(filename=html_path)

            # From file object
            with open(html_path, "r") as f:
                elements = partition_html(file=f)

            # From text string
            with open(html_path, "r") as f:
                text = f.read()
            elements = partition_html(text=text)

            tables, images, texts = self.parse_elements(elements)
        
        except Exception as e:
            print(f"Error in Extraction of HTML : {str(e)}")
            return [], [], []
                
        
    
    def extract_text_from_markdown(self, md_path):
        try:
            # From filename
            elements = partition_md(filename=md_path)

            # # From file object
            with open(md_path, "r") as f:
                elements = partition_md(file=f)

            # # From text string
            with open(md_path, "r") as f:
                text = f.read()
            elements = partition_md(text=text)
            tables, images, texts = self.parse_elements(elements)
            return tables, images, texts
        except Exception as e:
            print(f"Error in Extraction of Markdown : {str(e)}")
            return [], [], []
    
    def parse_elements(self, elements):
        try:
            tables = []
            images = []
            texts = []

            # Save image and table elements
            for i, element in enumerate(elements):
                # print(f"{i}. Type: {element.category}")
                
                if element.category == "Image":
                    #print(element.metadata.image_base64)
                    images.append(element.metadata.image_base64)
                
                elif element.category == "Table":
                    #print("Table HTML:")
                    tables.append(element.metadata.text_as_html)
                else:
                    texts.append(element.text)
            
            return tables, images, texts
        
        except Exception as e:
            print(f"Error Parsing Elements: {e}")
            return [], [], []
        
    
    def create_summaries_of_tables(self, tables):
        # Prompt
        prompt_text = """You are an assistant tasked with summarizing tables. \
        Give a concise summary of the table. Table chunk: {element} """
        prompt = ChatPromptTemplate.from_template(prompt_text)

        # Summary chain
        summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
        return table_summaries

    def create_summaries_of_texts(self, texts):
        # Prompt
        prompt_text = """You are an assistant tasked with summarizing texts. \
        Give a concise summary of the text. Text chunk: {element} """
        prompt = ChatPromptTemplate.from_template(prompt_text)

        # Summary chain
        summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
        return text_summaries

    def create_summaries_of_images(self, images):
        prompt_template = """You are an assistant tasked with summarizing images for retrieval.
                Remember these images could potentially contain graphs, charts or 
                tables also.
                These summaries will be embedded and used to retrieve the raw image 
                for question answering.
                Give a detailed summary of the image that is well optimized for 
                retrieval.
                Do not add additional words like Summary: etc.
             """
        messages = [
            (
                "user",
                [
                    {"type": "text", "text": prompt_template},
                    {
                        "type": "image_url",
                        "image_url": {"url": "data:image/jpeg;base64,{image}"},
                    },
                ],
            )
        ]

        prompt = ChatPromptTemplate.from_messages(messages)

        chain = prompt | model | StrOutputParser()

        image_summaries = chain.batch(images)
        return image_summaries

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from langchain_core.documents import Document

text_docs = [Document(page_content=t,metadata={"source": "pdf"},) for t in text]

In [77]:
from pypdf import PdfReader
from unstructured.partition.pdf import partition_pdf
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from unstructured.partition.html import partition_html
from unstructured.partition.md import partition_md
from unstructured.chunking.title import chunk_by_title
import os

file_name = "aadhar.pdf"
folder_name = file_name[:-4]
#Create folder with file_name
if not os.path.exists(f"data/{folder_name}"):
    os.makedirs(f"data/{folder_name}")

elements = partition_pdf(
            filename=f"data/{file_name}",                  # mandatory
            strategy="hi_res",
            extract_images_in_pdf=True,                            # mandatory to set as ``True``
            extract_image_block_types=["Image"],          # optional
            extract_image_block_to_payload=True,    
            infer_table_structure=True,# optional
            extract_image_block_output_dir=f"data/{folder_name}/",  # optional - only works when ``extract_image_block_to_payload=False``
            languages=["eng"],                           # optional
            )

chunks = chunk_by_title(elements, max_characters=4000, overlap=200)

In [91]:
images_b64 = []
tables = []
texts = []
for chunk in chunks:
        if "CompositeElement" in str(type(chunk)):
            
            texts.append(chunk.text)
            chunk_els = chunk.metadata.orig_elements
            for el in chunk_els:
               
                if "Image" in str(type(el)):
                    images_b64.append(el.metadata.image_base64)
                elif "Table" in str(type(el)):
                    tables.append(el.metadata.text_as_html)
                
                
# print(len(images_b64))
# print(tables)
print(len(texts))
for text in texts:
    print("=======================================================")
    print(text[:20])
    print("=======================================================")

6
eGov Products & Serv
AUTHENTICATION USING
Application, An MOU 
AUA’s application mu
| | informatics.nic.
ON-BOARDING PROCESS



In [74]:
table_chunks = [chunk for chunk in chunks if "<table" in chunk.text.lower()]

In [75]:
print(table_chunks)

[]


In [76]:
tables = []
for i, element in enumerate(elements):
    if element.category == "Table":
        tables.append(element)
print(len(tables))
print(tables)

1
[<unstructured.documents.elements.Table object at 0x000001A3D92CEED0>]


In [69]:
def parse_elements(chunks):
    try:
        tables = []
        texts = []

        for chunk in chunks:
            if "Table" in str(type(chunk)):
                tables.append(chunk)

            if "CompositeElement" in str(type((chunk))):
                texts.append(chunk.text)
        return tables, texts
    except Exception as e:
        print(f"Error Parsing Elements: {e}")
        return [], []

In [48]:
chunks[1].to_dict()

{'type': 'CompositeElement',
 'element_id': '8568050f-79af-4b13-8429-eae8d254d182',
 'text': 'Edited by MOHAN DAS VISWAM\n\nN\n\nIC signed an agreement with UIDAI for Authenti- cation User Agency/ Authentication Service Agency. Further a dedicated, structured\n\nplatform has been designed and devel- oped for e-Governance projects under Digital India initiative. Initially authentication service was started for Biometric Attendance System (BAS). A cloud based infrastructure was setup at National Data Centre, Shastri Park, New Delhi with dedicated 34 MBPS secure, redundant connectivity between National Data Centre and UIDAI Data Centre for authentication. NIC has further furnished dedicated secure connectivity between National Data Centre and UIDAI Data Centre. the',
 'metadata': {'file_directory': 'data',
  'filename': 'aadhar.pdf',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'last_modified': '2025-09-10T11:38:01',
  'page_number': 1,
  'orig_elements': 'eJzdVdtu20YQ/ZUBn1

In [70]:
tables, texts = parse_elements(chunks)

In [72]:
for text in texts:
    print(text)

eGov Products & Services

AADHAAR AUTHENTICATION PLATFORM Enabling Digital Governance with Citizen’s Identity

The Authentication Services is

integrated with 110 plus applications of States & Central Government Departments and Ministry with the monthly transaction of around 12 Crores in the month of November 2017. Indian Army, Public Distribution System, Digital Locker, CBSE, Jeevan Pramaan, Biometric Attendance System are some of the applications which have used the services for establishing the Digital Identity of Citizens to get the benefits.

RAJIV RATHI Sr. Technical Director rajiv@nic.in

m

AJAY SINHA Technical Director ak.sinha@gov.in

Edited by MOHAN DAS VISWAM

N

IC signed an agreement with UIDAI for Authenti- cation User Agency/ Authentication Service Agency. Further a dedicated, structured

platform has been designed and devel- oped for e-Governance projects under Digital India initiative. Initially authentication service was started for Biometric Attendance System (BAS).