### PDF Parsing


In [2]:
from langchain_community.document_loaders import(
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Baseline: parse the sample PDF with PyPDFLoader.
pypdfLoader = PyPDFLoader("data/pdf/sample.pdf")
pypdfDocuments = pypdfLoader.load()

# Show the loaded documents, then how many were produced.
print(pypdfDocuments)
print(f"pypdf has {len(pypdfDocuments)} documents")

[Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-10T17:59:35+00:00', 'source': 'data/pdf/sample.pdf', 'total_pages': 102, 'page': 0, 'page_label': '1'}, page_content='Unit–III(MONEYANDTEAM)\nRevenueStreams-Basicsofhowcompaniesmakemoney\nCompanies make money throughrevenuestreams, which are the different ways they generate income. Here\nare thebasicrevenuestreams businesses use:\n1.ProductSales 🛍️\nCompanies sell physical or digital products directly to consumers.\nExample:\n\ue12c Apple sells iPhones and MacBooks.\n\ue12c Nike sells shoes and apparel.\n2.ServiceRevenue 💼\nBusinesses earn money by providing services instead of products.\nExample:\n\ue12c A consulting firm charges for advisory services.\n\ue12c A gym charges membership fees.\n3.SubscriptionModel 🔄\nCustomers pay a recurring fee (monthly or yearly) for access to a product or service.\nExample:\n\ue12c Netflix charges a monthly fee for streaming.\n\ue12c Spotify of

In [None]:
## Better Approach

# Parse the same PDF with PyMuPDFLoader for comparison with the PyPDFLoader run.
# NOTE: the variable names are kept as `pypdfLoader` / `pypdfDocuments` so that
# any other cell reading them keeps working; from this cell on they hold the
# PyMuPDF loader and its documents, not the PyPDF ones.
pypdfLoader = PyMuPDFLoader("data/pdf/sample.pdf")
pypdfDocuments = pypdfLoader.load()
print(f"{pypdfDocuments}")
# Label the count with the loader that actually produced the documents.
print(f"pymupdf has {len(pypdfDocuments)} documents")

[Document(metadata={'producer': 'Pdftools SDK', 'creator': '', 'creationdate': '', 'source': 'data/pdf/sample.pdf', 'file_path': 'data/pdf/sample.pdf', 'total_pages': 102, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-09-10T17:59:35+00:00', 'trapped': '', 'modDate': 'D:20250910175935Z', 'creationDate': '', 'page': 0}, page_content='Unit – III (MONEY AND TEAM)\nRevenue Streams - Basics of how companies make money\nCompanies make money through revenue streams, which are the different ways they generate income. Here\nare the basic revenue streams businesses use:\n1. Product Sales 🛍️\nCompanies sell physical or digital products directly to consumers.\nExample:\n\ue12c\nApple sells iPhones and MacBooks.\n\ue12c\nNike sells shoes and apparel.\n2. Service Revenue 💼\nBusinesses earn money by providing services instead of products.\nExample:\n\ue12c\nA consulting firm charges for advisory services.\n\ue12c\nA gym charges membership fees.\n3. Sub

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [20]:
from langchain_core.documents import Document
from typing import List
class SmartPDFProcessor:
    """Load a PDF with PyPDFLoader, clean each page and split it into chunks.

    Each page's text is normalised with ``clean_text`` and split with a
    ``RecursiveCharacterTextSplitter``. Pages whose cleaned text is shorter
    than ``min_page_chars`` are skipped.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=40, min_page_chars=50):
        """Configure the splitter.

        Args:
            chunk_size: ``chunk_size`` passed to the text splitter.
            chunk_overlap: ``chunk_overlap`` passed to the text splitter.
            min_page_chars: pages whose cleaned text has fewer characters
                than this are dropped by ``process_pdf``.
        """
        # Plain assignments: a trailing comma here (`self.x = value,`) would
        # store a 1-tuple instead of the number.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # clean_text() collapses every whitespace run into a single space,
            # so a space is the only whitespace separator left in the text.
            separators=[" "],
        )

    def process_pdf(self, pdf_path) -> List[Document]:
        """Load ``pdf_path`` and return its content as chunk documents.

        Args:
            pdf_path: location of the PDF to load; converted with ``str()``
                before being handed to ``PyPDFLoader``.

        Returns:
            The chunks of all kept pages, in page order. Each chunk carries
            the page's loader metadata plus ``page`` (1-based position of the
            page among the loaded pages), ``total_pages``, ``chunk_method``
            and ``char_count`` (length of the page's cleaned text).
        """
        # Use the caller's path rather than a hard-coded sample file.
        pdf_docs = PyPDFLoader(str(pdf_path)).load()
        total_pages = len(pdf_docs)

        processed_chunks = []
        for page_number, page in enumerate(pdf_docs, start=1):
            cleaned_text = self.clean_text(page.page_content)
            # clean_text() output has no leading/trailing whitespace, so its
            # length can be compared directly.
            if len(cleaned_text) < self.min_page_chars:
                continue

            page_metadata = {
                **page.metadata,
                # These keys deliberately override same-named loader metadata.
                "page": page_number,
                "total_pages": total_pages,
                "chunk_method": "smart pdf processor",
                "char_count": len(cleaned_text),
            }
            processed_chunks.extend(
                self.text_splitter.create_documents(
                    texts=[cleaned_text],
                    metadatas=[page_metadata],
                )
            )
        return processed_chunks

    def clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and expand fi/fl ligatures.

        Args:
            text: raw page text.

        Returns:
            The text with leading/trailing whitespace removed, every internal
            whitespace run replaced by one space, and the ligature characters
            U+FB01 / U+FB02 replaced by "fi" / "fl".
        """
        cleaned_text = " ".join(text.split())
        # Escapes are used because the ligature glyphs are easy to misread
        # as the plain two-letter sequences.
        cleaned_text = cleaned_text.replace("\ufb01", "fi")
        cleaned_text = cleaned_text.replace("\ufb02", "fl")
        return cleaned_text
            

    

In [21]:
# Build the processor with its default settings, written out explicitly.
processor = SmartPDFProcessor(chunk_size=1000, chunk_overlap=40)

In [24]:
# Chunk the sample PDF and report how many chunk documents came back.
# Any failure is reported as a one-line message instead of a traceback.
try:
    smart_pdf = processor.process_pdf("data/pdf/sample.pdf")
    print(f"Smart pdf has {len(smart_pdf)} documents")
except Exception as exc:
    print(f"Error processing pdf: {exc}")

Smart pdf has 207 documents
