### Loading PDF using two libraries PyPDFLoader and PyMuPDFLoader

In [None]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
# PyMuPDFLoader is required when the pdf have images or complex formatting it is also faster than PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Using PyPDFLoader to load PDF document...")
loader = PyPDFLoader("data/pdffiles/Dimensionality Reduction for Network KPIs.pdf")
documents = loader.load()
print(f"Number of pages loaded: {len(documents)}")
print(documents[0].page_content[:500])  # Print first 500 characters of the first page
print("Metadata:", documents[0].metadata)

Using PyPDFLoader to load PDF document...
Number of pages loaded: 10
sensors
Letter
Feature Extraction for Dimensionality Reduction in
Cellular Networks Performance Analysis
Isabel de-la-Bandera 1,*
 , David Palacios 2, Jessica Mendoza 1
 and Raquel Barco 1
1 Department of Communications Engineering, University of M√°laga, 29071 M√°laga, Spain;
jmr@ic.uma.es (J.M.); rbm@ic.uma.es (R.B.)
2 Tupl Spain S.L., Tupl Inc., 29010 M√°laga, Spain; david.palacios@tupl.com
* Correspondence: ibanderac@ic.uma.es
Received: 13 October 2020; Accepted: 1 December 2020; Published: 4 D
Metadata: {'producer': 'pdfTeX-1.40.18', 'creator': 'LaTeX with hyperref package', 'creationdate': '2020-12-04T20:20:45+08:00', 'author': 'I. de-la-Bandera, D. Palacios, J. Mendoza and R. Barco', 'title': 'Feature Extraction for Dimensionality Reduction in Cellular Networks Performance Analysis', 'subject': "Next-generation mobile communications networks will have to cope with an extraordinary amount and variety of network 

In [4]:
print("Using PyMuPDFLoader to load PDF document...")
loader = PyMuPDFLoader("data/pdffiles/Dimensionality Reduction for Network KPIs.pdf")
documents = loader.load()
print(f"Number of pages loaded: {len(documents)}")
print(documents[0].page_content[:500])  # Print first 500 characters of the first page
print("Metadata:", documents[0].metadata)

Using PyMuPDFLoader to load PDF document...
Number of pages loaded: 10
sensors
Letter
Feature Extraction for Dimensionality Reduction in
Cellular Networks Performance Analysis
Isabel de-la-Bandera 1,*
, David Palacios 2, Jessica Mendoza 1
and Raquel Barco 1
1
Department of Communications Engineering, University of M√°laga, 29071 M√°laga, Spain;
jmr@ic.uma.es (J.M.); rbm@ic.uma.es (R.B.)
2
Tupl Spain S.L., Tupl Inc., 29010 M√°laga, Spain; david.palacios@tupl.com
*
Correspondence: ibanderac@ic.uma.es
Received: 13 October 2020; Accepted: 1 December 2020; Published: 4 Dec
Metadata: {'producer': 'pdfTeX-1.40.18', 'creator': 'LaTeX with hyperref package', 'creationdate': '2020-12-04T20:20:45+08:00', 'source': 'data/pdffiles/Dimensionality Reduction for Network KPIs.pdf', 'file_path': 'data/pdffiles/Dimensionality Reduction for Network KPIs.pdf', 'total_pages': 10, 'format': 'PDF 1.5', 'title': 'Feature Extraction for Dimensionality Reduction in Cellular Networks Performance Analysis', 'autho

In [24]:
import os
import re
import unicodedata
import fitz
import json
from PIL import Image
from io import BytesIO
import uuid
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

def deep_clean_text(text: str) -> str:
 # 1Ô∏è‚É£ Remove control characters (U+0000 - U+001F, except newline and tab)
    text = re.sub(r'[\x00-\x08\x0b-\x1f\x7f-\x9f]', '', text)

    # 2Ô∏è‚É£ Normalize Unicode (e.g. accents, special glyphs)
    text = unicodedata.normalize("NFKC", text)

    # 3Ô∏è‚É£ Replace non-breaking spaces with normal spaces
    text = text.replace('\xa0', ' ')

    # 4Ô∏è‚É£ Fix linebreak hyphenations: e.g. "connec-\ntion" ‚Üí "connection"
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # 5Ô∏è‚É£ Merge broken lines but keep paragraph breaks
    text = re.sub(r'\n{2,}', '\n\n', text)   # preserve double newlines
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  # single newline ‚Üí space

    # 6Ô∏è‚É£ Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # 7Ô∏è‚É£ Remove residual LaTeX junk (if any)
    text = re.sub(r'\\(begin|end)\{.*?\}', '', text)
    text = re.sub(r'\\[a-zA-Z]+\s*', '', text)

    # 8Ô∏è‚É£ Trim whitespace
    text = text.strip()

    return text


#Class for loading PDF with image extraction and text cleaning -> Document loading step
class SmartPDFLoader:
    def __init__(self, 
                 file_path, 
                 extract_images=True, 
                 save_image_dir="pdf_images", 
                 save_to_json=True, 
                 output_dir="json_doc_output"):
        """
        Hybrid Smart PDF Loader with:
        - Fallback loading (PyMuPDFLoader ‚Üí PyPDFLoader)
        - Text cleaning
        - Image extraction
        - Structure-aware + Recursive chunking
        """
        self.file_path = file_path
        self.extract_images = extract_images
        self.save_image_dir = save_image_dir
        self.save_to_json = save_to_json
        self.output_dir = output_dir

        os.makedirs(save_image_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

    # -------------------------------------------------------------------------
    # TEXT CLEANING
    # -------------------------------------------------------------------------
    def clean_text(self, text: str) -> str:
        """Deep clean PDF text: remove control chars, fix spaces, normalize."""
        text = deep_clean_text(text)
        return text

    # -------------------------------------------------------------------------
    # IMAGE EXTRACTION
    # -------------------------------------------------------------------------
    def extract_images_from_pdf(self) -> list:
        """Extracts all images with metadata."""
        images = []
        pdf_doc = fitz.open(self.file_path)

        for page_idx, page in enumerate(pdf_doc, start=1):
            image_list = page.get_images(full=True)
            for img_idx, img in enumerate(image_list, start=1):
                xref = img[0]
                base_image = pdf_doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image = Image.open(BytesIO(image_bytes))

                image_filename = f"page_{page_idx}_img_{img_idx}.{image_ext}"
                image_path = os.path.join(self.save_image_dir, image_filename)
                image.save(image_path)

                images.append({
                    "page": page_idx,
                    "path": image_path,
                    "width": image.width,
                    "height": image.height,
                    "ext": image_ext
                })
        return images

    # -------------------------------------------------------------------------
    # STRUCTURE-AWARE + RECURSIVE CHUNKER
    # -------------------------------------------------------------------------
    def structure_aware_chunk(self, docs, max_chars=1200, overlap=150):
        """Hybrid structure-based and recursive text splitting."""
        structured_chunks = []
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chars,
            chunk_overlap=overlap,
            separators=["\n\n", "\n", ".", " "],
        )

        # Section headers ‚Äî suitable for scientific PDFs
        section_pattern = re.compile(
            r'(?i)(?:^|\n)(abstract|introduction|background|related work|methodology|materials|'
            r'implementation|approach|results|discussion|conclusion|performance analysis|experiment setup|'
            r'results and discussion|references|appendix|\d+\.\s[A-Z].*?)(?=\n)',
            re.MULTILINE
        )

        for doc in docs:
            text = doc.page_content
            meta = doc.metadata.copy()

            # Split by major sections
            sections = []
            last_idx = 0
            for match in section_pattern.finditer(text):
                start = match.start()
                if start > last_idx:
                    section_text = text[last_idx:start].strip()
                    if section_text:
                        sections.append(section_text)
                last_idx = start
            if last_idx < len(text):
                sections.append(text[last_idx:].strip())

            # Process each section
            for section_text in sections:
                header_match = re.match(section_pattern, section_text)
                section_title = header_match.group(0).strip() if header_match else "General"
                section_body = section_text[len(section_title):].strip()

                sub_doc = [Document(page_content=section_body, metadata={**meta, "section": section_title})]
                split_docs = splitter.split_documents(sub_doc)

                for sd in split_docs:
                    sd.metadata["chunk_id"] = str(uuid.uuid4())
                    structured_chunks.append(sd)

        return structured_chunks

    # -------------------------------------------------------------------------
    # SAVE TO JSON
    # -------------------------------------------------------------------------
    def save_to_json_file(self, documents):
        json_data = [{"page_content": d.page_content, "metadata": d.metadata} for d in documents]
        json_filename = os.path.splitext(os.path.basename(self.file_path))[0] + ".json"
        json_path = os.path.join(self.output_dir, json_filename)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=4)
        print(f"‚úÖ JSON saved: {json_path}")
        return json_path

    # -------------------------------------------------------------------------
    # MAIN LOAD METHOD
    # -------------------------------------------------------------------------
    def load(self):
        """Load ‚Üí clean ‚Üí structure-aware chunking ‚Üí return list of Document objects."""
        try:
            print("üîπ Trying PyMuPDFLoader...")
            loader = PyMuPDFLoader(self.file_path)
            documents = loader.load()
        except Exception as e:
            print(f"‚ö†Ô∏è PyMuPDFLoader failed: {e}. Using PyPDFLoader instead.")
            loader = PyPDFLoader(self.file_path)
            documents = loader.load()

        print(f"‚úÖ Loaded {len(documents)} raw text chunks.")

        # Clean text
        cleaned_docs = []
        for d in documents:
            clean_content = self.clean_text(d.page_content)
            if clean_content:
                cleaned_docs.append(Document(page_content=clean_content, metadata=d.metadata))

        # Extract images
        image_docs = []
        if self.extract_images:
            imgs = self.extract_images_from_pdf()
            for img_data in imgs:
                image_docs.append(Document(page_content="[IMAGE]", metadata={"type": "image", **img_data}))

        # Chunk the text docs
        chunked_docs = self.structure_aware_chunk(cleaned_docs)
        print(f"‚úÖ Chunked into {len(chunked_docs)} sections.")

        final_docs = chunked_docs + image_docs

        # Save cleaned version (optional)
        if self.save_to_json:
            self.save_to_json_file(chunked_docs)

        return final_docs

In [None]:
documents=SmartPDFLoader("data/pdffiles/Dimensionality Reduction for Network KPIs.pdf").load()
print(f"Total chunks loded (text): {len(documents)}")
for chunks in documents:
    if chunks.page_content != "[IMAGE]":
        print(f"Text Document Content (first 100 chars): {chunks.page_content[:100]}")
        print(f"Metadata: {len(chunks.page_content)}")
        

üîπ Trying PyMuPDFLoader...
‚úÖ Loaded 10 raw text chunks.
‚úÖ Chunked into 37 sections.
‚úÖ JSON saved: json_doc_output\Dimensionality Reduction for Network KPIs.json
Total documents loaded (text + images): 49
Text Document Content (first 100 chars): Letter Feature Extraction for Dimensionality Reduction in Cellular Networks Performance Analysis Isa
Metadata: 1124
Text Document Content (first 100 chars): . Results using a dataset gathered from a live cellular network show the benefits of this approach, 
Metadata: 991
Text Document Content (first 100 chars): . To know the current network state (e.g., whether the network behavior is sub-optimal or degraded),
Metadata: 1089
Text Document Content (first 100 chars): . To avoid these problems, an efficient selection of a set of KPIs should be carried out in order to
Metadata: 572
Text Document Content (first 100 chars): 2020, 20, 6944 2 of 10 that is traditionally selected could not be the most suitable one for some sp
Metadata: 1132
Text 

## Custom Chunking