In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from langchain_core.documents import Document
from typing import List
class PuneetPDFProcessor:
    """Load a PDF, normalise the text of each page and split it into chunks.

    Each loaded page is cleaned with ``_clean_text``, skipped when too little
    text remains, and otherwise split with a ``RecursiveCharacterTextSplitter``.
    Every resulting chunk carries the page's metadata plus a few extra fields
    (see ``process_pdf``).

    Attributes:
        chunk_size: Maximum chunk size handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.
        min_page_chars: Pages whose cleaned text is shorter than this are skipped.
        text_splitter: The configured ``RecursiveCharacterTextSplitter``.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100,
                 min_page_chars: int = 50):
        """Configure the splitter.

        Args:
            chunk_size: Maximum chunk size passed to the splitter.
            chunk_overlap: Overlap between consecutive chunks.
            min_page_chars: Minimum length of a page's cleaned text for the
                page to be processed; shorter pages are dropped.
        """
        # No trailing commas here: ``self.x = value,`` would store a 1-tuple
        # such as ``(1000,)`` instead of the integer itself.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # _clean_text collapses every whitespace run into a single space,
            # so a space is the only whitespace separator left in the text.
            separators=[" "],
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """Load ``pdf_path`` and return its cleaned, chunked content.

        Args:
            pdf_path: Path of the PDF, passed unchanged to ``PyPDFLoader``.

        Returns:
            The chunks of all processed pages, in page order. Each chunk's
            metadata is the loaded page's metadata with these keys set:
            ``page`` (1-based position of the page in the loaded list,
            overriding any loader-provided value), ``total_pages``,
            ``chunk_method`` and ``char_count`` (length of the page's cleaned
            text, not of the individual chunk).
        """
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        total_pages = len(pages)

        processed_chunks: List[Document] = []

        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Cleaned text has no leading/trailing whitespace, so its length
            # can be compared directly. Near-empty pages are skipped.
            if len(cleaned_text) < self.min_page_chars:
                continue

            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num,
                    "total_pages": total_pages,
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text),
                }],
            )

            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace and expand the "fi"/"fl" ligature characters.

        Args:
            text: Raw text of one page.

        Returns:
            ``text`` with every run of whitespace replaced by a single space,
            leading/trailing whitespace removed, and the ligature characters
            U+FB01 and U+FB02 replaced by the plain letters "fi" and "fl".
        """
        # split() without arguments splits on any whitespace run and drops
        # empty strings, so joining with " " both collapses and trims.
        text = " ".join(text.split())

        # Ligature glyphs would otherwise not match the plain-letter spelling.
        text = text.replace("ﬁ", "fi")
        text = text.replace("ﬂ", "fl")

        return text


In [4]:
# Build a processor with the class's default arguments, then echo it as the cell output.
preprocessor = PuneetPDFProcessor()
preprocessor

<__main__.PuneetPDFProcessor at 0x1a364b46b30>

In [7]:
# Chunk the sample PDF and show the first chunk's metadata.
# Any exception raised along the way is printed instead of propagated.
try:
    smart_chunks = preprocessor.process_pdf("../documents/dummy_pdf.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    if len(smart_chunks) > 0:
        print("\nSample chunk metadata:")
        for field, content in smart_chunks[0].metadata.items():
            print(f"  {field}: {content}")

except Exception as exc:
    print(f"Processing error: {exc}")

Processed into 14 smart chunks

Sample chunk metadata:
  producer: Skia/PDF m142 Google Docs Renderer
  creator: PyPDF
  creationdate: 
  title: dummy_text_projects
  source: ../documents/dummy_pdf.pdf
  total_pages: 5
  page: 1
  page_label: 1
  chunk_method: smart_pdf_processor
  char_count: 2059
