# PDF Parsing Libraries

In [3]:
import time
from collections.abc import Callable
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymupdf
from pdfminer.high_level import extract_text
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1
from pypdf import PdfReader
from thefuzz import fuzz

In [70]:
# Data locations, all resolved against the notebook's current working directory.
# NOTE(review): PDF_FILES_PATH climbs one level ('../data') while the other three
# climb two ('../../data') — confirm which layout matches the repository.
PDF_FILES_PATH = Path.cwd() / '../data/raw/test pdfs'
# Joining Path.cwd() onto itself is a no-op (an absolute right-hand operand replaces
# the left one), so a single Path.cwd() yields the same path.
GROUND_TRUTHS = Path.cwd() / '../../data/raw/ground truth'
LIBRARY_OUTPUT = Path.cwd() / '../../data/text extractions'
# NOTE(review): points at the same directory as GROUND_TRUTHS — confirm whether a
# separate location was intended.
MANUAL_PDF = Path.cwd() / '../../data/raw/ground truth'

### Functions Testing

In [23]:
# Sanity check: print the path of every '*.pdf' entry found in the test-PDF directory.
for file in (Path.cwd() / '../data/raw/test pdfs').glob('*.pdf'):
    print(file)

/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/raw/test pdfs/matplotlib_41-61_pdf.pdf
/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/raw/test pdfs/matplotlib_61-81_pdf.pdf
/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/raw/test pdfs/latex_pdf.pdf
/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/raw/test pdfs/basketball_pdf.pdf
/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/raw/test pdfs/notion_pdf.pdf
/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/raw/test pdfs/matplotlib_1-21_pdf.pdf
/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/raw/test pdfs/chess_pdf.pdf
/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/raw/test pdfs/sleep_pdf.pdf
/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../data/

In [15]:
# IPython introspection: display the signature and docstring of Path.glob.
Path.glob?

[0;31mSignature:[0m [0mPath[0m[0;34m.[0m[0mglob[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mpattern[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mcase_sensitive[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mrecurse_symlinks[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern.
[0;31mFile:[0m      /opt/anaconda3/envs/project_chitti/lib/python3.13/pathlib/_local.py
[0;31mType:[0m      function

In [72]:
def pypdf_txt_extraction(file_path: Path) -> tuple[dict[int, str], list[tuple[int, float]]]:
    """
    Helper function to extract text from a pdf using pypdf page-wise, timing each page.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: (pdf_dict, time_list)
            pdf_dict: key: zero-based page number, value: page text
            time_list: one (page number, seconds spent extracting that page) tuple per page
    """
    reader = PdfReader(file_path)
    pdf_dict = {}
    time_list = []
    for page_num in range(len(reader.pages)):
        # perf_counter is the clock intended for measuring short intervals; unlike
        # time.time() it cannot jump when the system clock is adjusted.
        start = time.perf_counter()
        # The page lookup stays inside the timed region, as in the original measurement.
        extracted_txt = reader.pages[page_num].extract_text()
        elapsed = time.perf_counter() - start
        pdf_dict[page_num] = extracted_txt
        time_list.append((page_num, elapsed))
    return pdf_dict, time_list

In [73]:
def extract_from_multiple_pdfs(directory_path: Path):
    """
    Run pypdf_txt_extraction on every '*.pdf' file directly inside a directory.

    Args:
        directory_path: Path of the directory containing PDF files

    Returns:
        dict: key: PDF file name, value: tuple of
              (page-text dictionary, per-page timing list) as returned by
              pypdf_txt_extraction for that file.
    """
    # Lazily pair each file name with its extraction result, in glob order.
    extractions = (
        (pdf_path.name, pypdf_txt_extraction(pdf_path))
        for pdf_path in directory_path.glob('*.pdf')
    )
    # Unpack each result into its two parts before storing it as a tuple.
    return {
        pdf_name: (page_texts, page_timings)
        for pdf_name, (page_texts, page_timings) in extractions
    }


In [74]:
# Example usage: run the directory-level extraction over the configured test-PDF folder.
directory_path = Path(PDF_FILES_PATH)
# Pass the variable built above instead of leaving it unused.
results = extract_from_multiple_pdfs(directory_path)

UnboundLocalError: cannot access local variable 'v' where it is not associated with a value

In [69]:
# Display the currently configured test-PDF directory.
PDF_FILES_PATH

PosixPath('/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../../data/raw/test pdfs')

In [68]:
# Display the current contents of `results`.
results

{}

In [62]:
def extract_text_from_multiple_pdfs(
    func: Callable[[Path], tuple[dict, list]],
    file_paths: list[Path],
) -> dict[str, dict[str, object]]:
    """
    Extract text from multiple PDF files using the given extraction function.

    Args:
        func: Extraction function called once per file with the file's path; it must
              return a (pdf_dict, time_list) pair, e.g. pypdf_txt_extraction
        file_paths: List of paths to PDF files

    Returns:
        dict: key: file name, value: {'pdf_dict': ..., 'time_list': ...} holding the
              two values returned by func for that file. Because the key is the bare
              file name, two paths with the same name overwrite each other.
    """
    results = {}
    for file_path in file_paths:
        pdf_dict, time_list = func(file_path)
        results[file_path.name] = {'pdf_dict': pdf_dict, 'time_list': time_list}
    return results

In [64]:
# Collect every '*.pdf' path in the test-PDF directory, run the pypdf extractor
# over each one, and print the resulting dictionary.
file_paths = list((Path.cwd() / '../data/raw/test pdfs').glob('*.pdf'))
results = extract_text_from_multiple_pdfs(pypdf_txt_extraction, file_paths)
print(results)

UnboundLocalError: cannot access local variable 'v' where it is not associated with a value

In [None]:
# NOTE(review): called without the required `file_path` argument, so this raises
# TypeError as written.
pypdf_txt_extraction()

In [23]:
# NOTE(review): PDF_FILES_PATH is the test-PDF directory rather than a single PDF
# file; the recorded run of this cell ended in FileNotFoundError.
pypdf_txt_extraction(PDF_FILES_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti/analysis/../../data/raw/test pdfs'

In [4]:
def pymupdf_txt_extraction(file_path: Path) -> tuple[dict[str, str], list[float]]:
    """
    Helper function to extract text from a pdf using pymupdf page-wise, timing each page.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: (pdf_dict, time_list)
            pdf_dict: key: '<file name> ~ <zero-based page number>', value: page text
            time_list: seconds spent extracting each page, in page order
    """
    # Path.name is separator-agnostic, unlike splitting the string on '/'.
    file_name = Path(file_path).name
    pdf_dict = {}
    time_list = []
    # The context manager closes the document even if extraction raises part-way.
    with pymupdf.open(file_path) as document:
        for page_num in range(document.page_count):
            # perf_counter is the clock intended for measuring short intervals.
            start = time.perf_counter()
            text = document[page_num].get_text()
            elapsed = time.perf_counter() - start
            pdf_dict[f'{file_name} ~ {page_num}'] = text
            time_list.append(elapsed)
    return pdf_dict, time_list

In [5]:
def pdfminersix_txt_extraction(file_path: Path) -> tuple[dict[str, str], list[float]]:
    """
    Helper function to extract text from a pdf using pdfminer.six page-wise, timing each page.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: (pdf_dict, time_list)
            pdf_dict: key: '<file name> ~ <zero-based page number>', value: page text
            time_list: seconds spent extracting each page, in page order
    """
    # Path.name is separator-agnostic, unlike splitting the string on '/'.
    file_name = Path(file_path).name

    # Read the page count from the 'Pages' entry of the document catalog;
    # a missing 'Count' is treated as zero pages.
    with open(file_path, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        pages = resolve1(doc.catalog['Pages'])
        pages_count = pages.get('Count', 0)

    pdf_dict = {}
    time_list = []
    for page_num in range(pages_count):
        # perf_counter is the clock intended for measuring short intervals.
        start = time.perf_counter()
        # extract_text is handed the path (not the handle opened above), so whatever
        # per-call setup pdfminer performs is part of each page's timing.
        text = extract_text(file_path, page_numbers=[page_num])
        elapsed = time.perf_counter() - start
        pdf_dict[f'{file_name} ~ {page_num}'] = text
        time_list.append(elapsed)
    return pdf_dict, time_list

In [6]:
# Run all three extractors on the same sample PDF; each call returns a
# (pdf_dict, time_list) pair.
pypdf_output = pypdf_txt_extraction(PDF_FILES_PATH / 'chess_pdf.pdf')
pymupdf_output = pymupdf_txt_extraction(PDF_FILES_PATH / 'chess_pdf.pdf')
pdfminer_output = pdfminersix_txt_extraction(PDF_FILES_PATH / 'chess_pdf.pdf')

In [None]:
def _edit_distance(source, target):
    """Levenshtein distance between two sequences, using two rows of the DP table."""
    # Keep the shorter sequence along the row so each row is as small as possible.
    if len(source) < len(target):
        source, target = target, source
    previous = list(range(len(target) + 1))
    for row, source_char in enumerate(source, start=1):
        current = [row]
        for col, target_char in enumerate(target, start=1):
            substitution = previous[col - 1] + (source_char != target_char)
            current.append(min(previous[col] + 1, current[col - 1] + 1, substitution))
        previous = current
    return previous[-1]


def levenshtein_distance(ground_truth: Path, extracted):
    """
    Compute the Levenshtein (edit) distance between a ground-truth file and extracted text.

    The distance is computed with a pure-Python helper because no `Levenshtein`
    module is imported in this notebook; run time grows with the product of the
    two text lengths.

    Args:
        ground_truth: Path of the text file holding the reference text
        extracted: Extracted text to compare against the reference

    Returns:
        int: minimum number of single-character insertions, deletions and
             substitutions needed to turn one text into the other
    """
    # assumes the ground-truth files are UTF-8 encoded — TODO confirm
    with open(ground_truth, encoding='utf-8') as file:
        ground_content = file.read()
    return _edit_distance(ground_content, extracted)

In [None]:
# NOTE(review): `ground_truth` is not defined in the cells shown here (the config
# constant is GROUND_TRUTHS) and the required `extracted` argument is missing, so
# this call fails as written.
levenshtein_distance(ground_truth / 'chess_pdf.txt', )

In [7]:
# Side-by-side comparison table: one row per page, with each library's extracted
# text and per-page timing. Row labels come from the keys of the pypdf output.
pdf_df = pd.DataFrame({
    'PDF_ID': list(pypdf_output[0].keys()),
    'pypdf_txt': list(pypdf_output[0].values()),
    'pymupdf_txt': list(pymupdf_output[0].values()),
    'pdfminersix_text': list(pdfminer_output[0].values()),
    'pypdf_time': list(pypdf_output[1]),
    'pymupdf_time': list(pymupdf_output[1]),
    'pdfminersix_time': list(pdfminer_output[1]),
})
pdf_df

Unnamed: 0,PDF_ID,pypdf_txt,pymupdf_txt,pdfminersix_text,pypdf_time,pymupdf_time,pdfminersix_time
0,chess_pdf.pdf ~ 0,Author: Prateek \n \n 1 \n \n \n Chec...,Author: Prateek \n \n \n \n1\n \n \n Checkmat...,Author: Prateek \n\n Checkmate Through T...,0.061482,0.049782,0.043715
1,chess_pdf.pdf ~ 1,"Author: Prateek \n \n 2 \n \nChess, the gam...","Author: Prateek \n \n \n \n2\n \nChess, the ga...","Author: Prateek \n\nChess, the game of kings a...",0.010423,0.004764,0.018143
2,chess_pdf.pdf ~ 2,Author: Prateek \n \n 3 \n \nThe Birth of C...,Author: Prateek \n \n \n \n3\n \nThe Birth of ...,Author: Prateek \n\nThe Birth of Competitive C...,0.008634,0.002915,0.018321


In [18]:
# Mean per-page extraction time for each library, rounded to 5 decimal places.
print(pdf_df['pypdf_time'].values.mean().round(5))
print(pdf_df['pymupdf_time'].values.mean().round(5))
print(pdf_df['pdfminersix_time'].values.mean().round(5))

0.02685
0.01915
0.02673


In [19]:
# Plot pypdf's per-page extraction times using pandas' default plot.
pdf_df['pypdf_time'].plot()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [107]:
from pathlib import Path
from PyPDF2 import PdfReader
import time

def pypdf_txt_extraction(file_path: Path) -> tuple[dict[int, str], list[tuple[int, float]]]:
    """
    Helper function to extract text from a pdf page-wise, timing each page.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: (pdf_dict, time_list)
            pdf_dict: key: zero-based page number, value: page text. Pages whose
                      extracted text is empty or None are left out.
            time_list: one (page number, seconds spent extracting that page) tuple
                       for every page, including pages left out of pdf_dict
    """
    reader = PdfReader(file_path)
    pdf_dict = {}
    time_list = []

    for page_num in range(len(reader.pages)):
        # perf_counter is the clock intended for measuring short intervals; only the
        # extraction call itself is timed.
        start = time.perf_counter()
        extracted_txt = reader.pages[page_num].extract_text()
        elapsed = time.perf_counter() - start
        if extracted_txt:  # Skip None and empty-string results
            pdf_dict[page_num] = extracted_txt
        time_list.append((page_num, elapsed))
    return pdf_dict, time_list

def extract_from_multiple_pdfs(directory_path: Path, library_name: str):
    """
    Build one record per extracted page for every '*.pdf' file directly inside a directory.

    Extraction always goes through pypdf_txt_extraction; library_name only
    determines the name of the text column.

    Args:
        directory_path: Path of the directory containing PDF files
        library_name: Name of the PDF extraction library, used as the suffix of
                      the 'Extracted_txt_<library_name>' key

    Returns:
        list: One dictionary per page present in the extraction output, with keys
              'PDF_ID' (file name), 'Page_Number', 'Extracted_txt_<library_name>'
              and 'timelist' (the time_list entry at index Page_Number).
    """
    text_column = f'Extracted_txt_{library_name}'
    page_records = []

    # Snapshot the directory listing before any file is processed.
    for pdf_path in list(directory_path.glob('*.pdf')):
        print(f"Processing file: {pdf_path.name}")
        page_texts, page_timings = pypdf_txt_extraction(pdf_path)

        # One record per page key returned by the extractor.
        page_records.extend(
            {
                'PDF_ID': pdf_path.name,
                'Page_Number': page_number,
                text_column: page_text,
                'timelist': page_timings[page_number],
            }
            for page_number, page_text in page_texts.items()
        )

    return page_records
# Example usage: build per-page records for every PDF in the test directory and
# show them as a DataFrame.
results = extract_from_multiple_pdfs(PDF_FILES_PATH, 'pypdf')
pd.DataFrame(results)


Processing file: matplotlib_41-61_pdf.pdf
Processing file: matplotlib_61-81_pdf.pdf
Processing file: latex_pdf.pdf
Processing file: basketball_pdf.pdf
Processing file: notion_pdf.pdf
Processing file: matplotlib_1-21_pdf.pdf
Processing file: chess_pdf.pdf
Processing file: sleep_pdf.pdf
Processing file: matplotlib_81-101_pdf.pdf
Processing file: genai_pdf.pdf
Processing file: matplotlib_21-41_pdf.pdf


Unnamed: 0,PDF_ID,Page_Number,Extracted_txt_pypdf,timelist
0,matplotlib_41-61_pdf.pdf,0,"Matplotlib, Release 2.0.2\n0 10 20 30 40 50 60...","(0, 0.007756948471069336)"
1,matplotlib_41-61_pdf.pdf,1,"Matplotlib, Release 2.0.2\n0 10 20 30 40 50 60...","(1, 0.009490013122558594)"
2,matplotlib_41-61_pdf.pdf,2,"Matplotlib, Release 2.0.2\nax = plt.subplot(2,...","(2, 0.006669759750366211)"
3,matplotlib_41-61_pdf.pdf,3,"Matplotlib, Release 2.0.2\nax = plt.subplot2gr...","(3, 0.006701946258544922)"
4,matplotlib_41-61_pdf.pdf,4,"Matplotlib, Release 2.0.2\nAdjust GridSpec lay...","(4, 0.007735013961791992)"
...,...,...,...,...
121,matplotlib_21-41_pdf.pdf,15,"Matplotlib, Release 2.0.2\n0 100 200 300 400 5...","(15, 0.0036721229553222656)"
122,matplotlib_21-41_pdf.pdf,16,"Matplotlib, Release 2.0.2\n0 100 200 300 400 5...","(16, 0.004858255386352539)"
123,matplotlib_21-41_pdf.pdf,17,"Matplotlib, Release 2.0.2\n0.0 0.2 0.4 0.6 0.8...","(17, 0.014045953750610352)"
124,matplotlib_21-41_pdf.pdf,18,"Matplotlib, Release 2.0.2\n0 200 4000\n100\n20...","(18, 0.006106853485107422)"


In [97]:
# Show `results` as a DataFrame with rows and columns swapped.
pd.DataFrame(results).transpose()

Unnamed: 0,Extracted_txt_pypdf,timelist
matplotlib_41-61_pdf.pdf,"{0: 'Matplotlib, Release 2.0.2 0 10 20 30 40 5...","[(0, 0.01674795150756836), (1, 0.0066199302673..."
matplotlib_61-81_pdf.pdf,"{0: 'Matplotlib, Release 2.0.2 1.01.52.0y-labe...","[(0, 0.007433891296386719), (1, 0.007052183151..."
latex_pdf.pdf,{0: 'Sample PDF Document RobertMaron Grzegorz ...,"[(0, 0.0006620883941650391), (1, 0.00018310546..."
basketball_pdf.pdf,{0: 'The Global Game: Basketball's Unifying Po...,"[(0, 0.005740165710449219), (1, 0.022172927856..."
notion_pdf.pdf,{0: 'Gen AI Intr oduction  Not es 1G e n A I...,"[(0, 0.005805015563964844), (1, 0.001660823822..."
matplotlib_1-21_pdf.pdf,"{0: 'Matplotlib Release 2.0.2 John Hunter, Dar...","[(0, 0.001112222671508789), (1, 5.197525024414..."
chess_pdf.pdf,{0: 'Author: Prateek 1 Chec...,"[(0, 0.004670143127441406), (1, 0.008870840072..."
sleep_pdf.pdf,{0: ' Sleep Deprivation: Your Body's Silent ...,"[(0, 0.006541252136230469), (1, 0.006972789764..."
matplotlib_81-101_pdf.pdf,"{0: 'Matplotlib, Release 2.0.2 All of this ﬂex...","[(0, 0.00846409797668457), (1, 0.0056309700012..."
genai_pdf.pdf,{0: ' Generative AI: Your New Digi...,"[(0, 0.009430170059204102), (1, 0.008116960525..."


---