# PDF Parsing Libraries

In [1]:
from thefuzz import fuzz
from pathlib import Path
from pypdf import PdfReader
from pdfminer.high_level import extract_text
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1
import pymupdf
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

In [2]:
# Data directories, resolved against the directory the kernel was started in.
# NOTE(review): PDF_FILES_PATH climbs one level ('../data') while the other
# paths climb two ('../../data') — confirm which is right for this notebook.
PDF_FILES_PATH = Path.cwd() / '../data/raw/test pdfs'
GROUND_TRUTHS = Path.cwd() / '../../data/raw/ground truth'
LIBRARY_OUTPUT = Path.cwd() / '../../data/text extractions'
# NOTE(review): points at the same directory as GROUND_TRUTHS — confirm intended.
MANUAL_PDF = Path.cwd() / '../../data/raw/ground truth'

### Functions Testing

In [29]:
def pypdf_txt_extraction(file_path: Path) -> tuple[dict[int, str], list[tuple[int, float]]]:
    """
    Helper function to extract text from a pdf using pypdf page-wise.

    NOTE(review): this notebook defines `pypdf_txt_extraction` in more than
    one cell; whichever definition cell was executed last is the one in effect.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: (pdf_dict, time_list)
            pdf_dict: key: zero-based page number, value: page text. Pages
                for which no text was extracted are left out.
            time_list: one (page number, seconds taken) tuple per page,
                including pages that were left out of pdf_dict.
    """
    pdf_dict = {}
    time_list = []
    file = PdfReader(file_path)

    for page_num in range(len(file.pages)):
        # perf_counter is monotonic and high-resolution, which matters here
        # because a single page often takes well under a millisecond.
        start = time.perf_counter()
        extracted_txt = file.pages[page_num].extract_text()
        if extracted_txt:  # Skip None and empty-string results
            pdf_dict[page_num] = extracted_txt
        elapsed = time.perf_counter() - start
        time_list.append((page_num, elapsed))
    return pdf_dict, time_list


In [10]:
def pymupdf_txt_extraction(file_path: Path) -> tuple[dict[str, str], list[float]]:
    """
    Helper function to extract text from a pdf using pymupdf page-wise.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: (pdf_dict, time_list)
            pdf_dict: key: '<file name> ~ <zero-based page number>',
                value: page text. Every page gets an entry.
            time_list: seconds taken per page, in page order (bare floats,
                not (page, seconds) tuples).
    """
    # Path(...).name is separator-agnostic and also accepts a plain str path.
    pdf_name = Path(file_path).name
    pdf_dict = {}
    time_list = []

    # The context manager closes the document even if a page fails to parse.
    with pymupdf.open(file_path) as file:
        for page_num in range(file.page_count):
            # perf_counter: monotonic, high-resolution interval timer.
            start = time.perf_counter()
            text = file[page_num].get_text()
            elapsed = time.perf_counter() - start
            pdf_dict[f'{pdf_name} ~ {page_num}'] = text
            time_list.append(elapsed)
    return pdf_dict, time_list

In [11]:
def pdfminersix_txt_extraction(file_path: Path) -> tuple[dict[str, str], list[float]]:
    """
    Helper function to extract text from a pdf using pdfminer.six page-wise.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: (pdf_dict, time_list)
            pdf_dict: key: '<file name> ~ <zero-based page number>',
                value: page text. Every page gets an entry.
            time_list: seconds taken per page, in page order (bare floats,
                not (page, seconds) tuples).
    """
    # Path(...).name is separator-agnostic and also accepts a plain str path.
    pdf_name = Path(file_path).name
    pdf_dict = {}
    time_list = []

    # Read the page count from the document catalog's page-tree root.
    with open(file_path, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        pages = resolve1(doc.catalog['Pages'])
        # 'Count' may itself be stored as an indirect reference, so resolve it
        # before handing it to range().
        pages_count = resolve1(pages.get('Count', 0))

    for page_num in range(pages_count):
        # Each call is given the file path again, so the measured time is the
        # full cost of a single-page extract_text call.
        start = time.perf_counter()
        text = extract_text(file_path, page_numbers=[page_num])
        elapsed = time.perf_counter() - start
        pdf_dict[f'{pdf_name} ~ {page_num}'] = text
        time_list.append(elapsed)
    return pdf_dict, time_list

In [35]:
def _timings_by_page(pdf_dict: dict, time_list: list) -> dict:
    """Map each page key to its extraction time, whatever shape time_list has."""
    if all(isinstance(entry, (tuple, list)) and len(entry) == 2 for entry in time_list):
        # (page, seconds) pairs: look up by the page key carried in each pair.
        return dict(time_list)
    # Bare durations: pair them positionally with pdf_dict's keys, which are
    # in insertion order.
    return dict(zip(pdf_dict, time_list))


def extract_from_multiple_pdfs(directory_path: Path, library_name: str, func: callable):
    """
    Extract text from all PDF files in the specified directory using the specified library.

    Args:
        directory_path: Path of the directory containing PDF files
        library_name: Name of the PDF extraction library
        func: Function to extract text from a PDF file. It is called with the
            file's Path and must return (pdf_dict, time_list), where time_list
            is either a list of (page key, seconds) tuples or a list of bare
            seconds aligned with pdf_dict's entries.

    Returns:
        list: One dictionary per extracted page with 'PDF_ID', 'Page_Number',
        'Extracted_txt_[library_name]', and 'timelist' (seconds for that page,
        or None when no timing is available for it).
    """
    all_results = []

    # Sorted so the row order is reproducible; glob order depends on the filesystem.
    pdf_files = sorted(directory_path.glob('*.pdf'))

    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file.name}")

        # Extract text using the given function
        pdf_dict, time_list = func(pdf_file)
        time_dict = _timings_by_page(pdf_dict, time_list)

        # One row per extracted page
        for page_num, extracted_txt in pdf_dict.items():
            all_results.append({
                'PDF_ID': pdf_file.name,
                'Page_Number': page_num,
                f'Extracted_txt_{library_name}': extracted_txt,
                'timelist': time_dict.get(page_num),
            })

    return all_results


In [34]:
def pypdf_txt_extraction(file_path: Path) -> tuple[dict[int, str], list[tuple[int, float]]]:
    """
    Helper function to extract text from a PDF using PyPDF page-wise.

    Failures are reported with print() rather than raised, so a batch run over
    many files keeps going: a file that cannot be opened, is encrypted, or has
    no pages yields ({}, []), and a page that fails to extract is skipped.

    NOTE(review): this notebook defines `pypdf_txt_extraction` in more than
    one cell; whichever definition cell was executed last is the one in effect.

    Args:
        file_path: Path of the PDF file.

    Returns:
        dict: key = zero-based page number, value = extracted page text.
            Pages with no text (or whose extraction failed) are left out.
        list: list of tuples (page number, extraction time in seconds), one
            per page attempted.
    """
    # Resolved up front so the messages below also work for a plain str path.
    pdf_name = Path(file_path).name
    pdf_dict = {}
    time_list = []

    try:
        file = PdfReader(file_path)

        # Encrypted files are skipped; no decryption is attempted.
        if file.is_encrypted:
            print(f"Warning: {pdf_name} is encrypted and cannot be read.")
            return {}, []

        num_pages = len(file.pages)
        if num_pages == 0:
            print(f"Warning: {pdf_name} has no pages.")
            return {}, []

        for page_num in range(num_pages):
            # perf_counter: monotonic, high-resolution interval timer.
            start = time.perf_counter()
            try:
                extracted_txt = file.pages[page_num].extract_text()
                if extracted_txt:  # Skip None and empty-string results
                    pdf_dict[page_num] = extracted_txt
            except Exception as e:
                # Best effort: report the bad page and carry on with the rest.
                print(f"Error extracting text from {pdf_name}, page {page_num}: {e}")

            # Recorded for failed pages too, so time_list covers every page.
            time_list.append((page_num, time.perf_counter() - start))

    except Exception as e:
        print(f"Error opening {pdf_name}: {e}")
        return {}, []

    return pdf_dict, time_list

In [52]:
# Spot-check the pypdf extractor on a single file; the cell displays the
# returned (pdf_dict, time_list) tuple.
pypdf_txt_extraction(PDF_FILES_PATH / 'chess_pdf.pdf')

({0: 'Author: Prateek   \n \n  1 \n \n \n       Checkmate Through Time: The Global \nConquest of Chess \nFrom Chaturanga to Chess: The Birth of a Legend ................................ ...........................  2 \nThe European Evolution: Pawns Get a Boost ................................ ................................ ... 2 \nCoffee House Chess: The Game Goes Public ................................ ..............................  2 \nThe Birth of Competitive Chess ................................ ................................ .....................  3 \nChess in the Modern Era: From Grandmasters to Computers ................................ .......... 3 \n \n  \n',
  1: "Author: Prateek   \n \n  2 \n \nChess, the game of kings and queens, has captivated minds for centuries. From its \nhumble beginnings in ancient India to its current status as a global phenomenon, \nchess has evolved into a complex and fascinating pursuit. Let's embark on a journey \nthrough time to explore th

In [53]:
def pypdf_txt_extraction(file_path: Path) -> tuple[dict[int, str], list[tuple[int, float]]]:
    """
    Helper function to extract text from a pdf using pypdf page-wise.

    NOTE(review): this notebook defines `pypdf_txt_extraction` in more than
    one cell; whichever definition cell was executed last is the one in effect.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: (pdf_dict, time_list)
            pdf_dict: key: zero-based page number, value: page text. Pages
                for which no text was extracted are left out.
            time_list: one (page number, seconds taken) tuple per page,
                including pages that were left out of pdf_dict.
    """
    pdf_dict = {}
    time_list = []
    file = PdfReader(file_path)

    for page_num in range(len(file.pages)):
        # perf_counter is monotonic and high-resolution, which matters here
        # because a single page often takes well under a millisecond.
        start = time.perf_counter()
        extracted_txt = file.pages[page_num].extract_text()
        if extracted_txt:  # Skip None and empty-string results
            pdf_dict[page_num] = extracted_txt
        elapsed = time.perf_counter() - start
        time_list.append((page_num, elapsed))
    return pdf_dict, time_list


In [55]:
# Spot-check the pymupdf extractor on a single file; the cell displays the
# returned (pdf_dict, time_list) tuple, with keys of the form '<file name> ~ <page>'.
pymupdf_txt_extraction(PDF_FILES_PATH / 'matplotlib_41-61_pdf.pdf')

({'matplotlib_41-61_pdf.pdf ~ 0': 'Matplotlib, Release 2.0.2\n0\n10\n20\n30\n40\n50\n60\n0\n10\n20\n30\n40\nIn [21]: imgplot = plt.imshow(img, interpolation="bicubic")\n3.1. Introductory\n33\n',
  'matplotlib_41-61_pdf.pdf ~ 1': 'Matplotlib, Release 2.0.2\n0\n10\n20\n30\n40\n50\n60\n0\n10\n20\n30\n40\nBicubic interpolation is often used when blowing up photos - people tend to prefer blurry over pixelated.\n3.1.3 Customizing Location of Subplot Using GridSpec\nGridSpec speciﬁes the geometry of the grid that a subplot will be placed. The number of\nrows and number of columns of the grid need to be set. Optionally, the subplot layout\nparameters (e.g., left, right, etc.) can be tuned.\nSubplotSpec speciﬁes the location of the subplot in the given GridSpec.\nsubplot2grid() a helper function that is similar to subplot() but uses 0-based indexing\nand let subplot to occupy multiple cells.\nBasic Example of using subplot2grid\nTo use subplot2grid(), you provide geometry of the grid and the lo

In [50]:
# Run the pypdf extractor over every '*.pdf' in PDF_FILES_PATH and show the
# collected per-page rows as a DataFrame.
results = extract_from_multiple_pdfs(PDF_FILES_PATH, 'pypdf', pypdf_txt_extraction)
pd.DataFrame(results)

Processing file: matplotlib_41-61_pdf.pdf


UnboundLocalError: cannot access local variable 'v' where it is not associated with a value

In [None]:
# Display the frame built from `results` with rows and columns swapped.
# NOTE(review): the saved output under this cell has no PDF_ID / Page_Number
# columns, so it presumably predates the current extract_from_multiple_pdfs —
# re-run to confirm.
pd.DataFrame(results).transpose()

Unnamed: 0,Extracted_txt_pypdf,timelist
matplotlib_41-61_pdf.pdf,"{0: 'Matplotlib, Release 2.0.2 0 10 20 30 40 5...","[(0, 0.01674795150756836), (1, 0.0066199302673..."
matplotlib_61-81_pdf.pdf,"{0: 'Matplotlib, Release 2.0.2 1.01.52.0y-labe...","[(0, 0.007433891296386719), (1, 0.007052183151..."
latex_pdf.pdf,{0: 'Sample PDF Document RobertMaron Grzegorz ...,"[(0, 0.0006620883941650391), (1, 0.00018310546..."
basketball_pdf.pdf,{0: 'The Global Game: Basketball's Unifying Po...,"[(0, 0.005740165710449219), (1, 0.022172927856..."
notion_pdf.pdf,{0: 'Gen AI Intr oduction  Not es 1G e n A I...,"[(0, 0.005805015563964844), (1, 0.001660823822..."
matplotlib_1-21_pdf.pdf,"{0: 'Matplotlib Release 2.0.2 John Hunter, Dar...","[(0, 0.001112222671508789), (1, 5.197525024414..."
chess_pdf.pdf,{0: 'Author: Prateek 1 Chec...,"[(0, 0.004670143127441406), (1, 0.008870840072..."
sleep_pdf.pdf,{0: ' Sleep Deprivation: Your Body's Silent ...,"[(0, 0.006541252136230469), (1, 0.006972789764..."
matplotlib_81-101_pdf.pdf,"{0: 'Matplotlib, Release 2.0.2 All of this ﬂex...","[(0, 0.00846409797668457), (1, 0.0056309700012..."
genai_pdf.pdf,{0: ' Generative AI: Your New Digi...,"[(0, 0.009430170059204102), (1, 0.008116960525..."


---