# PDF Parsing Libraries

In [54]:
from thefuzz import fuzz
from pathlib import Path
from pypdf import PdfReader
from pdfminer.high_level import extract_text
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1
import pymupdf
import pandas as pd
import numpy as np

In [2]:
PDF_FILES_PATH = Path.cwd() / '../../data/raw'
LIBRARY_OUTPUT = Path.cwd() / '../../data/text extractions'
MANUAL_PDF = Path.cwd() / '../../data/raw/ground truth'

## PyPDF

### PDF 1

#### Text Extraction

In [10]:
#doc = pymupdf.open(PDF_FILES_PATH / 'PDF1.pdf')
with open(LIBRARY_OUTPUT / "pdf1_pypdf_output.txt", "w") as out:
    reader = PdfReader(PDF_FILES_PATH / 'chess_pdf.pdf')
    number_of_pages = len(reader.pages)
    page = reader.pages[0]
    out.write(page.extract_text()) # write text of page

In [6]:
reader = PdfReader(PDF_FILES_PATH / 'PDF1.pdf')
number_of_pages = len(reader.pages)
page = reader.pages[0]
text = page.extract_text()

Observations:
1. Overall maintained it's general format(paragraphs)
2. In the pdf, there was a space line between 2 lines; the library didn't implement this. 
3. Lot of commands and operations to do just to extract text from a pdf.

#### Similarity Ratio

In [8]:
with open(MANUAL_PDF / 'pdf1_manual.txt') as orig_file:
    original_txt = orig_file.read()
with open(LIBRARY_OUTPUT / 'pdf1_pypdf_output.txt') as lib_file:
    lib_text = lib_file.read()

In [9]:
fuzz.ratio(original_txt, lib_text)          # Similarity ratio for pypdf - out of 100.

90

This library is fairly accurate; however, it is relatively complicated to use -- it's a lot of code just to extract text from a pdf. 

---

## Pymupdf

### PDF 1

#### Text Extraction

In [17]:
doc = pymupdf.open(PDF_FILES_PATH / 'PDF1.pdf')
#out = open("pdf1_output.txt", "wb") # create a text output
with open(LIBRARY_OUTPUT / "pdf1_pymupdf_output.txt", "w") as out:
    for page in doc: # iterate the document pages
        text = page.get_text(sort=True) # get plain text (is in UTF-8)
        out.write(text) # write text of page

Observations:
1. Process to extract text seems relatively simpler than using pypdf.
2. Same problem as pypdf, didn't register the space line between the two lines.

#### Similarity Ratio

In [12]:
with open(MANUAL_PDF / 'pdf1_manual.txt') as orig_file:
    original_txt = orig_file.read()
with open(LIBRARY_OUTPUT / 'pdf1_pymupdf_output.txt') as lib_file:
    lib_text = lib_file.read()

In [13]:
fuzz.ratio(original_txt, lib_text)

97

---

## pdfminer.six

### PDF 1

#### Text Extraction

In [14]:
text = extract_text(PDF_FILES_PATH / 'PDF1.pdf', codec='utf-8')
with open(LIBRARY_OUTPUT / 'pdf1_pdfminer_output.txt', 'w') as out:
    out.write(text)

#### Observations
1. Much simpler than the last two; very intuitive
2. Unlike the other two, this library registed the space line added between the two lines, and added it in the txt file created.

#### Similarity Ratio

In [15]:
with open(MANUAL_PDF / 'pdf1_manual.txt') as orig_file:
    original_txt = orig_file.read()
with open(LIBRARY_OUTPUT / 'pdf1_pdfminer_output.txt') as lib_file:
    lib_text = lib_file.read()

In [16]:
fuzz.ratio(original_txt, lib_text)          # Similarity ratio for pypdf - out of 100.

97

---

In [48]:
def pypdf_txt_extraction(file_path: Path) -> dict[str, str]:
    """
    Helper function to extract text from a pdf using pypdf page-wise.

    Args:
        file_path: Path of the pdf file

    Returns:
        dict: key: page number, value: page text
    """
    pdf_dict = {}
    file = PdfReader(file_path)
    for page_num in range(len(file.pages)):
        pdf_dict[f'{str(test_pdf).split('/')[-1]} ~ {page_num}'] = file.pages[page_num].extract_text()
    return pdf_dict

In [49]:
def pymupdf_txt_extraction(file_path: Path) -> dict[str, str]:
    """
    Helper function to extract text from a pdf using pymupdf page-wise.

    Args:
        file_path: Path of the pdf file

    Returns:
        dict: key: page number, value: page text
    """
    file = pymupdf.open(file_path)
    pdf_dict = {}
    for page_num in range(file.page_count):
        text = file[page_num].get_text()
        pdf_dict[f'{str(file_path).split('/')[-1]} ~ {page_num}'] = text
    return pdf_dict

In [82]:
def pdfminersix_txt_extraction(file_path: Path) -> dict[str, str]:
    """
    Helper function to extract text from a pdf using pdfminer.six page-wise.

    Args:
        file_path: Path of the pdf file

    Returns:
        dict: key: page number, value: page text
    """
    pdf_dict = {}
    with open(test_path, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        pages = resolve1(doc.catalog['Pages'])
        pages_count = pages.get('Count', 0)
    for page_num in range(pages_count+1):
        text = extract_text(file_path, page_numbers=[page_num])
        pdf_dict[f'{str(file_path).split('/')[-1]} ~ {page_num}'] = text
    return pdf_dict

In [89]:
def pdf_dataframe(pypdf_dict: dict, pymupdf_dict: dict, pdfminersix_dict: dict) -> pd.DataFrame:
    pdf_df = pd.DataFrame({'PDF_ID': list(pypdf_dict.keys())})
    pdf_df['pypdf_txt'] = list(pypdf_dict.values())
    pdf_df['pymupdf_txt'] = list(pymupdf_dict.values())
    pdf_df['pdfminersix_text'] = list(pdfminersix_dict.values())
    return pdf_df

In [90]:
pdf_dataframe(pypdf_dict=pypdf_dict, pymupdf_dict=pymupdf_dict, pdfminersix_dict=pdfminersix_dict)

Unnamed: 0,PDF_ID,pypdf_txt,pymupdf_txt,pdfminersix_text
0,chess_pdf.pdf ~ 0,TheGlobal Game: Basketball'sUnifyingPower inth...,The Global Game: Basketball's Unifying\nPower ...,The Global Game: Basketball's Unifying\nPower ...
1,chess_pdf.pdf ~ 1,FromPeachBasketstoGlobalPhenomenon\nThestorybe...,From Peach Baskets to Global Phenomenon\nThe s...,From Peach Baskets to Global Phenomenon\n\nThe...
2,chess_pdf.pdf ~ 2,TheEconomicSlamDunk\nBasketball'seconomicimpac...,The Economic Slam Dunk\nBasketball's economic ...,The Economic Slam Dunk\n\nBasketball's economi...
3,chess_pdf.pdf ~ 3,UrbanRenaissanceThroughBasketball\nThesport'si...,Urban Renaissance Through Basketball\nThe spor...,Urban Renaissance Through Basketball\n\nThe sp...


In [68]:
pypdf_dict = pypdf_txt_extraction(PDF_FILES_PATH / 'basketball_pdf.pdf')
pdf_df = pd.DataFrame({'PDF_ID': list(pypdf_dict.keys())})
pdf_df['pypdf_txt'] = list(pypdf_dict.values())

In [69]:
pdf_df

Unnamed: 0,PDF_ID,pypdf_txt
0,chess_pdf.pdf ~ 0,TheGlobal Game: Basketball'sUnifyingPower inth...
1,chess_pdf.pdf ~ 1,FromPeachBasketstoGlobalPhenomenon\nThestorybe...
2,chess_pdf.pdf ~ 2,TheEconomicSlamDunk\nBasketball'seconomicimpac...
3,chess_pdf.pdf ~ 3,UrbanRenaissanceThroughBasketball\nThesport'si...


In [75]:
pymupdf_dict = pymupdf_txt_extraction(PDF_FILES_PATH / 'basketball_pdf.pdf')
#pdf_df['PDF_ID'] = list(pymupdf_dict.keys())
pdf_df['pymupdf_txt'] = list(pymupdf_dict.values())

In [76]:
pdf_df

Unnamed: 0,PDF_ID,pypdf_txt,pymupdf_txt
0,basketball_pdf.pdf ~ 0,TheGlobal Game: Basketball'sUnifyingPower inth...,The Global Game: Basketball's Unifying\nPower ...
1,basketball_pdf.pdf ~ 1,FromPeachBasketstoGlobalPhenomenon\nThestorybe...,From Peach Baskets to Global Phenomenon\nThe s...
2,basketball_pdf.pdf ~ 2,TheEconomicSlamDunk\nBasketball'seconomicimpac...,The Economic Slam Dunk\nBasketball's economic ...
3,basketball_pdf.pdf ~ 3,UrbanRenaissanceThroughBasketball\nThesport'si...,Urban Renaissance Through Basketball\nThe spor...


In [83]:
pdfminersix_dict = pdfminersix_txt_extraction(PDF_FILES_PATH / 'basketball_pdf.pdf')
#pdf_df['PDF_ID'] = list(pdfminersix_dict.keys())
pdf_df['pdfminersix_text'] = list(pdfminersix_dict.values())

In [85]:
pdf_df

Unnamed: 0,PDF_ID,pypdf_txt,pymupdf_txt,pdfminersix_text
0,basketball_pdf.pdf ~ 0,TheGlobal Game: Basketball'sUnifyingPower inth...,The Global Game: Basketball's Unifying\nPower ...,The Global Game: Basketball's Unifying\nPower ...
1,basketball_pdf.pdf ~ 1,FromPeachBasketstoGlobalPhenomenon\nThestorybe...,From Peach Baskets to Global Phenomenon\nThe s...,From Peach Baskets to Global Phenomenon\n\nThe...
2,basketball_pdf.pdf ~ 2,TheEconomicSlamDunk\nBasketball'seconomicimpac...,The Economic Slam Dunk\nBasketball's economic ...,The Economic Slam Dunk\n\nBasketball's economi...
3,basketball_pdf.pdf ~ 3,UrbanRenaissanceThroughBasketball\nThesport'si...,Urban Renaissance Through Basketball\nThe spor...,Urban Renaissance Through Basketball\n\nThe sp...


In [2]:
from Levenshtein import distance

In [3]:
def metrics(pypdf_dict: dict, pymupdf_dict: dict, pdfminersix_dict: dict):
    
    pass

In [4]:
str1 = "kitten"
str2 = "sitting"

distance = distance(str1, str2)
print(distance)  # Output: 3

3
