# PDF Parsing Libraries

In [19]:
from thefuzz import fuzz
from pathlib import Path
from pypdf import PdfReader
from pdfminer.high_level import extract_text
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1
import pymupdf
import pandas as pd
import numpy as np
import time

In [7]:
# Data locations, all expressed relative to the current working directory
# (two levels up, then into ``data``). They are only valid when the notebook is
# launched from its own folder.
# Directory holding the sample PDFs that the extraction functions are run on.
PDF_FILES_PATH = Path.cwd().joinpath('..', '..', 'data', 'raw', 'test pdfs')
# NOTE(review): presumably where the per-library extracted text is stored - confirm.
LIBRARY_OUTPUT = Path.cwd().joinpath('..', '..', 'data', 'text extractions')
# NOTE(review): presumably the manually prepared reference ("ground truth") material - confirm.
MANUAL_PDF = Path.cwd().joinpath('..', '..', 'data', 'raw', 'ground truth')

### Function Testing

In [62]:
def pypdf_txt_extraction(file_path: Path) -> tuple[dict[str, str], list[float]]:
    """
    Helper function to extract text from a pdf using pypdf page-wise.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: ``(pdf_dict, time_list)`` where
            pdf_dict: key: '<file name> ~ <zero-based page index>', value: page text
            time_list: seconds spent extracting each page, in page order
    """
    # Path(...).name works with any path separator (and with plain strings),
    # unlike splitting the string form on '/'.
    file_name = Path(file_path).name
    pdf_dict = {}
    time_list = []
    reader = PdfReader(file_path)
    # Index-based access is kept on purpose: the page lookup stays inside the
    # timed region, exactly as it is for the other libraries' helpers.
    for page_num in range(len(reader.pages)):
        # perf_counter is monotonic and high-resolution, which matters for
        # the millisecond-scale durations measured here.
        start = time.perf_counter()
        extracted_txt = reader.pages[page_num].extract_text()
        end = time.perf_counter()
        pdf_dict[f'{file_name} ~ {page_num}'] = extracted_txt
        time_list.append(end - start)
    return pdf_dict, time_list

In [63]:
def pymupdf_txt_extraction(file_path: Path) -> tuple[dict[str, str], list[float]]:
    """
    Helper function to extract text from a pdf using pymupdf page-wise.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: ``(pdf_dict, time_list)`` where
            pdf_dict: key: '<file name> ~ <zero-based page index>', value: page text
            time_list: seconds spent extracting each page, in page order
    """
    # Path(...).name works with any path separator (and with plain strings),
    # unlike splitting the string form on '/'.
    file_name = Path(file_path).name
    pdf_dict = {}
    time_list = []
    # The context manager closes the document even if a page fails to extract;
    # previously the handle was never released.
    with pymupdf.open(file_path) as file:
        for page_num in range(file.page_count):
            # perf_counter is monotonic and high-resolution, which matters for
            # the millisecond-scale durations measured here.
            start = time.perf_counter()
            text = file[page_num].get_text()
            end = time.perf_counter()
            pdf_dict[f'{file_name} ~ {page_num}'] = text
            time_list.append(end - start)
    return pdf_dict, time_list

In [58]:
def pdfminersix_txt_extraction(file_path: Path) -> tuple[dict[str, str], list[float]]:
    """
    Helper function to extract text from a pdf using pdfminer.six page-wise.

    Args:
        file_path: Path of the pdf file

    Returns:
        tuple: ``(pdf_dict, time_list)`` where
            pdf_dict: key: '<file name> ~ <zero-based page index>', value: page text
            time_list: seconds spent extracting each page, in page order
    """
    # Path(...).name works with any path separator (and with plain strings),
    # unlike splitting the string form on '/'.
    file_name = Path(file_path).name
    pdf_dict = {}
    time_list = []
    # First pass: read only the page count from the document catalog.
    with open(file_path, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        pages = resolve1(doc.catalog['Pages'])
        # resolve1 also covers a '/Count' stored as an indirect reference,
        # which would otherwise reach range() as a non-integer object.
        pages_count = resolve1(pages.get('Count', 0))
    for page_num in range(pages_count):
        # extract_text is handed the path rather than an open document, so the
        # measured time includes whatever per-call setup it performs.
        start = time.perf_counter()
        text = extract_text(file_path, page_numbers=[page_num])
        end = time.perf_counter()
        pdf_dict[f'{file_name} ~ {page_num}'] = text
        time_list.append(end - start)
    return pdf_dict, time_list

In [64]:
# Run every extractor on the same sample document so the outputs line up
# page for page. Each result is a (page-text dict, per-page timings) pair.
sample_pdf_path = PDF_FILES_PATH / 'chess_pdf.pdf'
pypdf_output = pypdf_txt_extraction(sample_pdf_path)
pymupdf_output = pymupdf_txt_extraction(sample_pdf_path)
pdfminer_output = pdfminersix_txt_extraction(sample_pdf_path)

In [65]:
# One row per page. Element [0] of each extractor result is the
# {page id: text} dict and element [1] is the list of per-page timings.
# Columns are aligned purely by position, so all three extractors must have
# produced the same number of pages in the same order.
pdf_df = pd.DataFrame(
    {
        'PDF_ID': [*pypdf_output[0]],
        'pypdf_txt': [*pypdf_output[0].values()],
        'pymupdf_txt': [*pymupdf_output[0].values()],
        'pdfminersix_text': [*pdfminer_output[0].values()],
        'pypdf_time': [*pypdf_output[1]],
        'pymupdf_time': [*pymupdf_output[1]],
        'pdfminersix_time': [*pdfminer_output[1]],
    }
)
# Bare expression so the notebook renders the table.
pdf_df

Unnamed: 0,PDF_ID,pypdf_txt,pymupdf_txt,pdfminersix_text,pypdf_time,pymupdf_time,pdfminersix_time
0,chess_pdf.pdf ~ 0,Author: Prateek \n \n 1 \n \n \n Chec...,Author: Prateek \n \n \n \n1\n \n \n Checkmat...,Author: Prateek \n\n Checkmate Through T...,0.034179,0.060958,0.054039
1,chess_pdf.pdf ~ 1,"Author: Prateek \n \n 2 \n \nChess, the gam...","Author: Prateek \n \n \n \n2\n \nChess, the ga...","Author: Prateek \n\nChess, the game of kings a...",0.030468,0.005449,0.023171
2,chess_pdf.pdf ~ 2,Author: Prateek \n \n 3 \n \nThe Birth of C...,Author: Prateek \n \n \n \n3\n \nThe Birth of ...,Author: Prateek \n\nThe Birth of Competitive C...,0.013944,0.003093,0.020734


---