In [1]:
#Libraries used
from unstructured.partition.pdf import partition_pdf # For conversion of PDF into elements .
from langchain_core.documents import Document # For converting elements into Document .
from pathlib import Path # For Loading the directory or PDF path .
from typing import List
import traceback # Used for error handling .

In [2]:
# Function to load all the PDFs found in a directory (searched recursively) .
def loading_pdf(dir_path:str='../data/pdf',image_dir:str='../data/images',min_text_length:int=20)->List[Document]:
    """Load every PDF found under ``dir_path`` and turn its text elements into Documents.

    Each PDF is partitioned with ``unstructured``'s ``partition_pdf`` using the
    ``hi_res`` strategy. Every element whose stripped text is longer than
    ``min_text_length`` characters becomes one ``Document`` whose ``page_content``
    is the element text and whose ``metadata`` is ``element.metadata.to_dict()``.
    A PDF that raises during loading is reported, recorded in the failure list and
    skipped, so one bad file does not abort the whole run.

    Requirements noted by the original author: Poppler (PDF rendering) and
    Tesseract (OCR for image-based PDFs) must be installed.
      -> https://github.com/oschwartz10612/poppler-windows
      -> https://github.com/tesseract-ocr/tesseract

    Args:
        dir_path: Directory that is searched recursively for PDF files.
        image_dir: Root directory for extracted images. Each PDF gets its own
            sub-directory that mirrors its path (without the extension) relative
            to ``dir_path``, so same-named PDFs in different folders do not collide.
        min_text_length: Elements whose stripped text has this many characters or
            fewer are dropped.

    Returns:
        The Documents built from all successfully loaded PDFs (empty list when the
        directory contains no PDFs).

    Raises:
        NotADirectoryError: If ``dir_path`` is not an existing directory.
    """
    pdf_root=Path(dir_path)

    if not pdf_root.is_dir():
        raise NotADirectoryError(f"{pdf_root} is a invalid directory .")

    print(f"The directory path is : {pdf_root} .")
    # Compare the suffix case-insensitively so '.PDF' files are also found on
    # case-sensitive file systems, and sort so the processing order is reproducible.
    pdf_paths=sorted(
        candidate for candidate in pdf_root.rglob("*")
        if candidate.is_file() and candidate.suffix.lower()==".pdf"
    )
    print(f"Number of PDF's in directory is {len(pdf_paths)}")

    if not pdf_paths:
        print('No documents in the dir_path')
        return []

    # Running totals for the summary printed at the end .
    all_pdf_size=0.0
    total_elements=0 # Only the count is reported, so the element objects are not retained .
    all_documents=[]
    failed_pdf=[]

    print("="*20,"PDF LOAD SUMMARY","="*20)
    print("-"*45)

    for serial,pdf_path in enumerate(pdf_paths,start=1):
        print(f"{serial} ---> Loading {pdf_path.name} ")
        try:
            # stat() lives inside the try so an unreadable file is reported as a
            # failed PDF instead of aborting the whole batch .
            pdf_size_mb=pdf_path.stat().st_size/(1024**2)
            print(f"File size : {pdf_size_mb:.3f} MB")

            # One image folder per PDF, mirroring its location below pdf_root .
            imag_dir=Path(image_dir)/pdf_path.relative_to(pdf_root).with_suffix('')
            imag_dir.mkdir(parents=True,exist_ok=True)

            # partition_pdf arguments :
            # filename -> Path of the PDF .
            # extract_images_in_pdf -> Save images detected in the PDF ; only applies with strategy='hi_res' .
            #     NOTE(review): the unstructured docs mark this flag for deprecation in favour of
            #     `extract_image_block_types` -- confirm against the installed version .
            # infer_table_structure -> Better table structure ; only applies with strategy='hi_res' .
            # strategy -> Layout detection strategy ; different strategies need different compute time .
            # extract_image_block_output_dir -> Directory the extracted images are written to .
            #     The previously used `image_output_dir_path` keyword did not redirect the images
            #     (see the original "Fix this issue" note) ; this is the parameter documented for it .
            # languages -> Language(s) the PDF is written in .
            pdf=partition_pdf(
                filename=str(pdf_path),
                extract_images_in_pdf=True,
                infer_table_structure=True,
                strategy='hi_res',
                extract_image_block_output_dir=str(imag_dir),
                languages=['eng'],
            )

            # Build this PDF's Documents in one step so a failure part-way through
            # never leaves partial results for a PDF that is reported as failed .
            pdf_documents=[
                Document(
                    page_content=element.text,
                    metadata=element.metadata.to_dict(),
                )
                for element in pdf
                if element.text and len(element.text.strip())>min_text_length
            ]
            all_documents.extend(pdf_documents)

            print(f"Total Elements in the PDF : {len(pdf)}")
            all_pdf_size+=pdf_size_mb
            total_elements+=len(pdf)
        except Exception as e:
            # Deliberate best-effort : report, remember the file and continue with the next PDF .
            print(f" Error loading {pdf_path.name} . Error {e}")
            failed_pdf.append(pdf_path.name)
            traceback.print_exc()
        print("-"*45) # Close this PDF's section whether it loaded or failed .

    # Summary over all PDFs that loaded successfully .
    print(f"Total size of all the PDF's are : {all_pdf_size:.3f} MB")
    print(f"Total Elements from all the PDF's : {total_elements}")
    print(f"Total Number of Documents : {len(all_documents)} ")
    print("-"*45)

    # Names of the PDFs that could not be loaded .
    if failed_pdf:
        print(f"Failed PDF : {failed_pdf}")

    return all_documents

In [3]:
documents=loading_pdf() # Load every PDF under the default '../data/pdf' directory ; the result is a list of Documents .

The directory path is : ..\data\pdf .
Number of PDF's in directory is 1
---------------------------------------------
1 ---> Loading hubble-science-highlights.pdf 
File size : 14.789 MB


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Total Elements in the PDF : 746
---------------------------------------------
Total size of all the PDF's are : 14.789 MB
Total Elements from all the PDF's : 746
Total Number of Documents : 410 
---------------------------------------------


In [4]:
print(documents[0]) # First Document built from the extracted PDF elements (page_content plus metadata) .

page_content='Reshaping Our Cosmic View Hubble Science Highlights' metadata={'detection_class_prob': 0.35657215118408203, 'is_extracted': 'true', 'coordinates': {'points': ((np.float64(524.7530517578125), np.float64(1336.1209716796875)), (np.float64(524.7530517578125), np.float64(1678.66552734375)), (np.float64(1399.0055555555552), np.float64(1678.66552734375)), (np.float64(1399.0055555555552), np.float64(1336.1209716796875))), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2026-01-22T23:26:37', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': '..\\data\\pdf', 'filename': 'hubble-science-highlights.pdf'}


In [5]:
print(documents[0].type) # Value of the first Document's `type` attribute (the recorded output shows "Document") .

Document


In [6]:
print(documents[0].page_content) # Text of the first Document , i.e. the text of the element it was built from .

Reshaping Our Cosmic View Hubble Science Highlights


In [7]:
print(documents[0].metadata) # Metadata dict of the first Document , taken from element.metadata.to_dict() .

{'detection_class_prob': 0.35657215118408203, 'is_extracted': 'true', 'coordinates': {'points': ((np.float64(524.7530517578125), np.float64(1336.1209716796875)), (np.float64(524.7530517578125), np.float64(1678.66552734375)), (np.float64(1399.0055555555552), np.float64(1678.66552734375)), (np.float64(1399.0055555555552), np.float64(1336.1209716796875))), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2026-01-22T23:26:37', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': '..\\data\\pdf', 'filename': 'hubble-science-highlights.pdf'}
