# Document loading

In [1]:
import pymupdf # For pdf loading
from langchain_core.documents import Document # For converting elements into Document .
from pathlib import Path # For Loading the directory or PDF path .
from typing import List
import traceback # Used for error handling .

In [2]:
# Helpers and entry point to load all the PDF's from a directory .
def _extract_page_images(pdf,page,page_num,imag_dir,pdf_stem,min_image_side=50)->list:
    """Save the usable images of one page to disk and describe them for metadata .

    Args:
        pdf: Open PyMuPDF document the page belongs to .
        page: The PyMuPDF page being processed .
        page_num: 1-based page number , used in file names and metadata .
        imag_dir: Directory (Path) where the PNG files of this PDF are written .
        pdf_stem: File name of the PDF without extension , used to build image ids .
        min_image_side: Images narrower or shorter than this many pixels are skipped .

    Returns:
        A list of dicts with the keys image_id , path , page and bbox ; one entry per
        rectangle where the image is placed on the page .
    """
    page_images=[]

    for img_index,img in enumerate(page.get_images(full=True)):
        if img[1]!=0: # Entries that carry a smask (transparency layer) are skipped .
            continue
        xref=img[0]

        rects=page.get_image_rects(xref) # Placement rectangles of the image on this page .
        if not rects: # Image is referenced but not placed anywhere on the page .
            continue

        pix=pymupdf.Pixmap(pdf,xref) # Building the pixmap from the image's xref .

        if pix.width<min_image_side or pix.height<min_image_side: # Removing very tiny images .
            pix=None
            continue

        # Removing images whose samples are all zero . any() stops at the first non-zero byte
        # instead of scanning the whole buffer like max() does .
        if pix.alpha and pix.samples is not None and not any(pix.samples):
            pix=None
            continue

        # CMYK has 4 colour components : n==4 without alpha , n==5 with alpha . PNG can only
        # store gray / RGB , so both cases must be converted (the old n>4 test missed plain CMYK) .
        if pix.n-pix.alpha>=4:
            pix=pymupdf.Pixmap(pymupdf.csRGB,pix)

        img_path=imag_dir/f"page_{page_num}_img_{img_index}.png" # Location for storing the image on local disk .
        pix.save(img_path)
        pix=None # Releasing the pixmap early .

        for rect in rects:
            page_images.append({
                "image_id":f"{pdf_stem}_p{page_num}_i{img_index}",
                "path":str(img_path),
                "page":page_num,
                "bbox":[rect.x0,rect.y0,rect.x1,rect.y1]
            })

    return page_images


def _extract_text_blocks(page,page_num,min_text_chars=20)->list:
    """Collect the text blocks of one page , ordered top-to-bottom then left-to-right .

    Args:
        page: The PyMuPDF page being processed .
        page_num: 1-based page number stored with every block .
        min_text_chars: Blocks whose stripped text is shorter than this are dropped .

    Returns:
        A list of dicts with the keys block_id , text , bbox and page . block_id is the
        position of the block in the sorted block list (dropped blocks keep their number) .
    """
    text_blocks=[]
    blocks=sorted(page.get_text("blocks"),key=lambda b:(b[1],b[0]))

    for block_id,b in enumerate(blocks):
        # PyMuPDF block tuples end with (block_no , block_type) ; type 0 is text . Skipping
        # anything else so image placeholder lines never end up as page text .
        if len(b)>6 and b[6]!=0:
            continue
        x0,y0,x1,y1,text=b[:5] # Coordinates and text of the block .
        text=text.strip()
        if len(text)<min_text_chars: # Very short texts are not useful for chunking or embedding .
            continue

        text_blocks.append({
            "block_id":block_id,
            "text":text,
            "bbox":[x0,y0,x1,y1], # Used later to check relevance between images and text .
            "page":page_num,
        })

    return text_blocks


def loading_pdf(dir_path:str='../data/pdf',images_dir:str='../data/images_pymupdf',min_text_chars:int=20,min_image_side:int=50)->List[Document]:
    """Load every PDF below dir_path into one Document per page that contains text .

    Images of each page are written as PNG files to images_dir/<pdf name without extension>/
    and referenced from the metadata of the page Document . Pages without any kept text block
    produce no Document . A PDF that raises while loading is reported , recorded as failed and
    skipped ; Documents already created for its earlier pages are kept .

    Args:
        dir_path: Directory that is searched recursively for *.pdf files .
        images_dir: Root directory for the extracted images .
        min_text_chars: Minimum length of a text block to be kept .
        min_image_side: Minimum width and height (pixels) of an image to be kept .

    Returns:
        List of Documents with page_content (kept blocks joined by new lines) and the
        metadata keys source , page_num , text_blocks and images .

    Raises:
        NotADirectoryError: If dir_path is not an existing directory .
    """
    root=Path(dir_path)

    if not root.is_dir(): # Checking if the directory is valid .
        raise NotADirectoryError(f"{root} is a invalid directory .")

    print(f"The directory path is : {root} .")
    pdf_paths=sorted(root.rglob("*.pdf")) # Sorted so the Document order does not depend on the file system .
    print(f"Number of PDF's in directory is {len(pdf_paths)}")

    if not pdf_paths: # Checking if any PDFs exists in the directory .
        print('No documents in the dir_path')
        return []

    # All these variables are used for the stats check at the end .
    all_pdf_size=0.0
    all_documents=[]
    failed_pdf=[]

    print("="*20,"PDF LOAD SUMMARY","="*20)
    print("-"*45)

    for serial,pdf_path in enumerate(pdf_paths,start=1):
        print(f"{serial} ---> Loading {pdf_path.name} ")
        try:
            # stat() is inside the try so one unreadable file does not abort the whole run .
            pdf_size_mb=pdf_path.stat().st_size/(1024**2)
            print(f"File size : {pdf_size_mb:.3f} MB")

            imag_dir=Path(images_dir)/pdf_path.stem # Directory for storing images of this PDF .
            imag_dir.mkdir(parents=True,exist_ok=True)

            # The with-block closes the PDF even when a page raises .
            with pymupdf.open(filename=pdf_path,filetype="pdf") as pdf:
                for page_num,page in enumerate(pdf,start=1):
                    page_images=_extract_page_images(pdf,page,page_num,imag_dir,pdf_path.stem,min_image_side)
                    text_blocks=_extract_text_blocks(page,page_num,min_text_chars)

                    if not text_blocks: # Nothing to embed on this page .
                        continue

                    all_documents.append(Document(
                        page_content="\n".join(b["text"] for b in text_blocks),
                        metadata={
                            'source':pdf_path.name,
                            'page_num':page_num,
                            'text_blocks':text_blocks,
                            'images':page_images,
                        }
                    ))

            all_pdf_size+=pdf_size_mb # Only successfully loaded PDFs are counted .
            print("-"*45)
        except Exception as e:
            print(f" Error loading {pdf_path.name} . Error {e}") # Exception handling .
            failed_pdf.append(pdf_path.name) # Storing the PDF that failed to load .
            traceback.print_exc() # Full stack trace for debugging .

    # Some stats of loading all the PDFs in the directory .
    print(f"Total size of all the PDF's are : {all_pdf_size:.3f} MB")
    print(f"Total Number of Documents : {len(all_documents)} ")
    print("-"*45)

    # Printing all the PDFs which were not able to load .
    if failed_pdf:
        print(f"Failed PDF : {failed_pdf}")

    return all_documents # Type -> List of Document

# Chunking

In [3]:
from langchain_core.documents import Document # Datatype of a block or a chunk .
from typing import List # Used to store list of Documents or to specify return type .

In [4]:
# Vertical distance between two blocks ; the chunker starts a new chunk when this distance is larger than its threshold .
def vertical_gap(block1,block2)->float:
    """Return the distance from the bottom edge of block1 to the top edge of block2 .

    Both arguments are dicts with a "bbox" entry laid out as [x0 , y0 , x1 , y1] .
    The result is negative when block2 starts above the bottom of block1 .
    """
    lower_top=block2["bbox"][1] # y0 of the second block .
    upper_bottom=block1["bbox"][3] # y1 of the first block .
    return lower_top-upper_bottom

In [5]:
# The following function returns the smallest box that encloses the bbox of every given block .
def merge_bbox(blocks):
    """Return (x0 , y0 , x1 , y1) of the box surrounding all blocks .

    Every block is a dict whose "bbox" entry is laid out as [x0 , y0 , x1 , y1] .
    An empty blocks sequence raises ValueError (from min) .
    """
    lefts=[blk["bbox"][0] for blk in blocks]
    left=min(lefts) # x0 left
    tops=[blk["bbox"][1] for blk in blocks]
    top=min(tops) # y0 top
    rights=[blk["bbox"][2] for blk in blocks]
    right=max(rights) # x1 right
    bottoms=[blk["bbox"][3] for blk in blocks]
    bottom=max(bottoms) # y1 bottom
    return (left,top,right,bottom)

In [6]:
# The following function creates a chunk Document out of a group of text blocks of one page Document .
def build_chunk(doc:Document,blocks:list,images:list)->Document:
    """Combine blocks into a single chunk Document .

    Args:
        doc: Page Document the blocks come from ; supplies "source" and "page_num" .
        blocks: Text block dicts (each with "text" and "bbox") that form the chunk .
        images: Image dicts stored unchanged under the "images" metadata key .

    Returns:
        A Document whose page_content is the block texts joined by new lines and whose
        metadata holds source , page_num , bbox (enclosing box) , text_blocks and images .
    """
    texts=[blk["text"] for blk in blocks]
    chunk_text="\n".join(texts) # Combining all the texts from the blocks .
    outer_box=merge_bbox(blocks) # Overall coordinates of the chunk .

    chunk_metadata={
        "source":doc.metadata["source"],
        "page_num":doc.metadata["page_num"],
        "bbox":outer_box,
        "text_blocks":blocks,
        "images":images
    }
    return Document(page_content=chunk_text,metadata=chunk_metadata)

In [7]:
# The following function tells whether two boxes overlap ; it is used to decide if an image is relevant to a chunk .
def bbox_overlap(a,b)->bool: # Here a is the chunk bbox and b is the image bbox ; generic names because the function may be reused for other boxes .
    """Return True when boxes a and b , both laid out as [x0 , y0 , x1 , y1] , overlap or touch ."""
    if a[2] < b[0]: # a ends before b starts : a is completely to the left of b .
        return False
    if a[0] > b[2]: # a starts after b ends : a is completely to the right of b .
        return False
    if a[3] < b[1]: # bottom of a is above the top of b : a is completely above b .
        return False
    if a[1] > b[3]: # top of a is below the bottom of b : a is completely below b .
        return False
    return True

## Main chunking strategy

In [8]:
# The following function is the main chunking strategy ; it uses the bbox and a maximum character count to group blocks of a page into chunks .
# max_chars -> maximum characters in a single chunk . (TODO : check whether token based chunking with tiktoken is a better fit .)
# max_vertical_gap -> maximum vertical distance between two consecutive blocks of the same chunk . Calculated using bbox .

def bbox_chunker(documents:List[Document],max_chars:int=1000,max_vertical_gap:int=40)->List[Document]:
    """Group the text blocks of every page Document into chunk Documents .

    Blocks are taken in the order stored under the "text_blocks" metadata key . A new chunk
    is started when adding the next block would exceed max_chars or when the block is more
    than max_vertical_gap below the previous one . Chunks never span two input Documents .

    Args:
        documents: Page Documents carrying "text_blocks" and "images" in their metadata .
        max_chars: Maximum summed block text length per chunk (joining new lines are not
            counted) . A single block longer than this becomes a chunk of its own ; it is
            not split .
        max_vertical_gap: Maximum distance between the bottom of one block and the top of
            the next block inside the same chunk .

    Returns:
        List of chunk Documents built with build_chunk . Each chunk still carries all the
        images of its page .
    """
    all_chunks=[] # Used to store chunks .

    for doc in documents:
        blocks=doc.metadata.get("text_blocks",[]) # Text blocks of the page .
        images=doc.metadata.get("images",[]) # All the images of the page .

        if not blocks:
            continue

        current_blocks=[] # Blocks collected for the chunk being built .
        current_len=0 # Characters collected so far in the chunk being built .

        for block in blocks:
            block_len=len(block["text"]) # Characters in this single block .

            # Only close a chunk that actually has blocks . Without this guard a first block
            # longer than max_chars built an empty chunk and merge_bbox failed on min() of
            # an empty sequence .
            if current_blocks:
                too_long=current_len+block_len>max_chars
                too_far=vertical_gap(current_blocks[-1],block)>max_vertical_gap # Previous block and current block .
                if too_long or too_far:
                    all_chunks.append(build_chunk(doc,current_blocks,images))
                    current_blocks=[]
                    current_len=0

            current_blocks.append(block)
            current_len+=block_len

        if current_blocks: # Remaining blocks at the end of the page form the last chunk .
            all_chunks.append(build_chunk(doc,current_blocks,images))

    print(f"{len(all_chunks)} of chunks were created using {len(documents)} documents .")
    return all_chunks

## Image relevance strategy

In [9]:
# The following function keeps , for every chunk , only the images that are relevant to it .
def attach_images(chunks:List[Document])->List[Document]:
    """Filter the "images" metadata of every chunk down to the overlapping images .

    An image is kept when its bbox overlaps the chunk bbox according to bbox_overlap .
    The chunks are modified in place and the same list object is returned .
    """
    for chunk in chunks:
        meta=chunk.metadata
        region=meta["bbox"] # Coordinates of the chunk .
        candidates=meta["images"] # Images currently stored on the chunk .

        overlapping=[]
        for candidate in candidates:
            if bbox_overlap(region,candidate["bbox"]): # Image overlaps the chunk , so it stays attached .
                overlapping.append(candidate)

        meta["images"]=overlapping # Replacing the image list in the metadata .

    return chunks

In [10]:
documents=loading_pdf() # Loading the PDFs from the default directory '../data/pdf' into page-level Documents .

The directory path is : ..\data\pdf .
Number of PDF's in directory is 1
---------------------------------------------
1 ---> Loading hubble-science-highlights.pdf 
File size : 14.789 MB
---------------------------------------------
Total size of all the PDF's are : 14.789 MB
Total Number of Documents : 74 
---------------------------------------------


In [11]:
chunks=bbox_chunker(documents) # Creating chunks with the default max_chars=1000 and max_vertical_gap=40 .

161 of chunks were created using 74 documents .


In [12]:
print(chunks[1]) # Displaying the second chunk (index 1) ; at this point its metadata still lists every image of its page .

page_content='HUBBLE SPACE TELESCOPE' metadata={'source': 'hubble-science-highlights.pdf', 'page_num': 2, 'bbox': (443.36920166015625, 366.5845947265625, 565.1551513671875, 379.16461181640625), 'text_blocks': [{'block_id': 0, 'text': 'HUBBLE SPACE TELESCOPE', 'bbox': [443.36920166015625, 366.5845947265625, 565.1551513671875, 379.16461181640625], 'page': 2}], 'images': [{'image_id': 'hubble-science-highlights_p2_i0', 'path': '..\\data\\images_pymupdf\\hubble-science-highlights\\page_2_img_0.png', 'page': 2, 'bbox': [39.327880859375, 25.74951171875, 581.6133422851562, 364.67791748046875]}]}


In [13]:
print(chunks[1].page_content) # Page content of the second chunk (index 1) .

HUBBLE SPACE TELESCOPE


In [14]:
print(chunks[1].metadata) # Metadata of the second chunk (index 1) .

{'source': 'hubble-science-highlights.pdf', 'page_num': 2, 'bbox': (443.36920166015625, 366.5845947265625, 565.1551513671875, 379.16461181640625), 'text_blocks': [{'block_id': 0, 'text': 'HUBBLE SPACE TELESCOPE', 'bbox': [443.36920166015625, 366.5845947265625, 565.1551513671875, 379.16461181640625], 'page': 2}], 'images': [{'image_id': 'hubble-science-highlights_p2_i0', 'path': '..\\data\\images_pymupdf\\hubble-science-highlights\\page_2_img_0.png', 'page': 2, 'bbox': [39.327880859375, 25.74951171875, 581.6133422851562, 364.67791748046875]}]}


In [15]:
final_chunks=attach_images(chunks) # Keeping only the overlapping images on each chunk . Note : attach_images edits the chunk metadata in place and returns the same list , so chunks is changed too .

In [16]:
print(final_chunks[1]) # Displaying the second chunk (index 1) after the image filtering .

page_content='HUBBLE SPACE TELESCOPE' metadata={'source': 'hubble-science-highlights.pdf', 'page_num': 2, 'bbox': (443.36920166015625, 366.5845947265625, 565.1551513671875, 379.16461181640625), 'text_blocks': [{'block_id': 0, 'text': 'HUBBLE SPACE TELESCOPE', 'bbox': [443.36920166015625, 366.5845947265625, 565.1551513671875, 379.16461181640625], 'page': 2}], 'images': []}


In [17]:
print(final_chunks[1].page_content) # Page content of the second chunk (index 1) .

HUBBLE SPACE TELESCOPE


In [18]:
print(final_chunks[0].metadata) # Metadata of the first chunk (index 0 , not index 1 as in the cells above) .

{'source': 'hubble-science-highlights.pdf', 'page_num': 1, 'bbox': (175.6049041748047, 471.7757263183594, 503.6419677734375, 756.75), 'text_blocks': [{'block_id': 0, 'text': 'Reshaping Our \nCosmic View', 'bbox': [218.0, 471.7757263183594, 503.6419677734375, 570.6857299804688], 'page': 1}, {'block_id': 1, 'text': 'H U B B L E  S P A C E  T E L E S C O P E', 'bbox': [175.6049041748047, 489.989990234375, 195.92990112304688, 756.75], 'page': 1}, {'block_id': 2, 'text': 'Hubble Science Highlights', 'bbox': [221.0, 570.6558837890625, 483.0419921875, 600.4658813476562], 'page': 1}], 'images': [{'image_id': 'hubble-science-highlights_p1_i0', 'path': '..\\data\\images_pymupdf\\hubble-science-highlights\\page_1_img_0.png', 'page': 1, 'bbox': [-1.0525050163269043, -1.1285400390625, 613.0826416015625, 793.124755859375]}]}


In [19]:
# Used for json data manipulation .
import json

In [20]:
print(json.dumps(final_chunks[0].metadata,indent=2,sort_keys=True)) # Pretty print of the first chunk's metadata as indented JSON with sorted keys .

{
  "bbox": [
    175.6049041748047,
    471.7757263183594,
    503.6419677734375,
    756.75
  ],
  "images": [
    {
      "bbox": [
        -1.0525050163269043,
        -1.1285400390625,
        613.0826416015625,
        793.124755859375
      ],
      "image_id": "hubble-science-highlights_p1_i0",
      "page": 1,
      "path": "..\\data\\images_pymupdf\\hubble-science-highlights\\page_1_img_0.png"
    }
  ],
  "page_num": 1,
  "source": "hubble-science-highlights.pdf",
  "text_blocks": [
    {
      "bbox": [
        218.0,
        471.7757263183594,
        503.6419677734375,
        570.6857299804688
      ],
      "block_id": 0,
      "page": 1,
      "text": "Reshaping Our \nCosmic View"
    },
    {
      "bbox": [
        175.6049041748047,
        489.989990234375,
        195.92990112304688,
        756.75
      ],
      "block_id": 1,
      "page": 1,
      "text": "H U B B L E  S P A C E  T E L E S C O P E"
    },
    {
      "bbox": [
        221.0,
        570.6558837