In [8]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document

class SmartPDFProcessor:
    """Advanced PDF processing with error handling and data cleaning"""

    def __init__(self, chunk_size=1000, chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, 
            chunk_overlap=chunk_overlap, 
            separators=["\n\n", "\n", " ", ""] 
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """Process PDF with smart chunking and metadata enrichment"""

        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        print(f"PyPDFLoader found {len(pages)} pages.") 

        processed_chunks = []

        for page_num, page in enumerate(pages):
        
            cleaned_text = self.cleanText(page.page_content)
             
            print(f"page details {page} page content {page.page_content}")
            print(f"Page {page_num + 1} char count: {len(cleaned_text.strip())}")

            # SKIP if empty, otherwise process
            if len(cleaned_text.strip()) < 10:
                continue  # Skip this page

           
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    "page": page_num + 1,
                    "total_pages": len(pages),
                    "source": pdf_path,
                    "chunk_method": "SmartPDFProcesor",
                    "character_count": len(cleaned_text)
                }]
            )
            processed_chunks.extend(chunks)

        return processed_chunks

    # This is now a method of the class (added 'self')
    def cleanText(self, text):
        """Cleans up text by removing extra spaces and fixing ligatures."""
        text = " ".join(text.split())  # Remove extra spaces

        # 5. Fixed ligatures 
        text = text.replace("ﬁ", "fi")
        text = text.replace("ﬂ", "fl")

        return text

# --- Main execution ---
preprocessor = SmartPDFProcessor()

# Process a PDF file
try:

    chunks = preprocessor.process_pdf("data/pdf/data.pdf")
    print(f"Created {len(chunks)} chunks")


except Exception as e:
    print(f"Error processing PDF: {e}")

PyPDFLoader found 211 pages.
page details page_content='' metadata={'producer': 'jsPDF 2.4.0', 'creator': 'PyPDF', 'creationdate': '2023-06-16T12:36:28+05:30', 'source': 'data/pdf/data.pdf', 'total_pages': 211, 'page': 0, 'page_label': '1'} page content 
Page 1 char count: 0
page details page_content='' metadata={'producer': 'jsPDF 2.4.0', 'creator': 'PyPDF', 'creationdate': '2023-06-16T12:36:28+05:30', 'source': 'data/pdf/data.pdf', 'total_pages': 211, 'page': 1, 'page_label': '2'} page content 
Page 2 char count: 0
page details page_content='' metadata={'producer': 'jsPDF 2.4.0', 'creator': 'PyPDF', 'creationdate': '2023-06-16T12:36:28+05:30', 'source': 'data/pdf/data.pdf', 'total_pages': 211, 'page': 2, 'page_label': '3'} page content 
Page 3 char count: 0
page details page_content='' metadata={'producer': 'jsPDF 2.4.0', 'creator': 'PyPDF', 'creationdate': '2023-06-16T12:36:28+05:30', 'source': 'data/pdf/data.pdf', 'total_pages': 211, 'page': 3, 'page_label': '4'} page content 
Page