In [1]:
import PyPDF2
import pymupdf
import pdfplumber
import os
import tika

## Metadata

In [2]:

for document_name in os.listdir("data"):
    if document_name[-4:] == ".pdf":
        input_pdf = os.path.join("data", document_name)
        with open(input_pdf, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            
            # Extract metadata
            metadata = reader.metadata
            print(f"Document Name: {document_name}")
            print("Metadata:")
            for key, value in metadata.items():
                print(f"{key}: {value}")
            print("\n")

Document Name: chodov.pdf
Metadata:
/Author: Monika Ferdová
/Creator: Microsoft® Word 2019
/CreationDate: D:20240419115023+02'00'
/ModDate: D:20240419115023+02'00'
/Producer: Microsoft® Word 2019




## Different libs 

In [5]:
fname1 = os.path.join("data", "chodov.pdf")
fname2 = os.path.join("data", "chodov.pdf")

personal_info = {
    "jméno": "John Doe",
    "Jméno": "Kyryl smith",
}


### PyMuPDF 

In [6]:
import pymupdf 
# Open the PDF file
pdf_document = pymupdf.open(fname1)

# Loop through each page
for page in pdf_document:
    # Search for the placeholder text
    for key in personal_info.keys():
        text_instances = page.search_for(key)
        for inst in text_instances:
            # Get coordinates of the placeholder
            x0, y0, x1, y1 = inst
            print(f'name: {key}, coords: {inst}')
            # Define your personal information
            page.insert_text((x0, y0), personal_info[key], fontsize=12)
pdf_document.save("output\\PyMuPDF.pdf")
pdf_document.close()

name: jméno, coords: Rect(67.46399688720703, 288.90252685546875, 92.00836944580078, 298.95721435546875)
name: jméno, coords: Rect(69.26100158691406, 650.9925537109375, 93.69737243652344, 661.0472412109375)
name: Jméno, coords: Rect(67.46399688720703, 288.90252685546875, 92.00836944580078, 298.95721435546875)
name: Jméno, coords: Rect(69.26100158691406, 650.9925537109375, 93.69737243652344, 661.0472412109375)
name: jméno, coords: Rect(60.599998474121094, 614.362548828125, 87.66436767578125, 624.417236328125)
name: jméno, coords: Rect(60.2400016784668, 694.072509765625, 87.30437469482422, 704.127197265625)
name: Jméno, coords: Rect(60.599998474121094, 614.362548828125, 87.66436767578125, 624.417236328125)
name: Jméno, coords: Rect(60.2400016784668, 694.072509765625, 87.30437469482422, 704.127197265625)
name: jméno, coords: Rect(126.13343811035156, 578.22998046875, 154.51004028320312, 589.27001953125)
name: Jméno, coords: Rect(126.13343811035156, 578.22998046875, 154.51004028320312, 589.2

In [8]:
%%capture cap --no-stderr
for doc_name in os.listdir("data"):
    doc = pymupdf.open(f"data\\{doc_name}")
    page1 = doc[0]
    print(f'NAME: {doc_name}')
    print(page1.get_text())
    print('#############################################################################################')


with open('output\\pymupdf\\get_text.txt', 'w') as f:
    f.write(cap.stdout)
del cap

### PDFplumber

In [9]:
%%capture cap --no-stderr
import pdfplumber

for doc_name in os.listdir("data"):
    with pdfplumber.open(f"data\\{doc_name}") as pdf:
        page1 = pdf.pages[0]
        print(f'NAME: {doc_name}')
        print(page1.extract_text())
        print('#############################################################################################')


with open('output\\PDFplumber\\get_text.txt', 'w') as f:
    f.write(cap.stdout)
del cap


### PyPDF2

In [10]:
%%capture cap --no-stderr
from PyPDF2 import PdfReader

for doc_name in os.listdir("data"):
    reader = PdfReader(f"data\\{doc_name}")
    page1 = reader.pages[0]
    print(f'NAME: {doc_name}')
    print(page1.extract_text())
    print('#############################################################################################')


with open('output\\pypdf2\\get_text.txt', 'w') as f:
    f.write(cap.stdout)
del cap

### Tika

In [11]:
%%capture cap --no-stderr

import tika
from tika import parser
for doc_name in os.listdir("data"):
    parsed = parser.from_file(f"data\\{doc_name}")
    print(f'NAME: {doc_name}')
    print("Metadata: ")
    print(parsed["metadata"])
    print("Content: ")
    print(parsed["content"])

    print('#############################################################################################')


with open('output\\tika\\get_text.txt', 'w') as f:
    f.write(cap.stdout)
del cap

2024-11-20 16:51:53,904 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


### pytesseract

In [12]:
import pytesseract
print(pytesseract.get_languages(config=''))

['ces', 'eng', 'osd']


In [13]:
import pytesseract
from PIL import Image
import io


def pdf_page_to_image(page, dpi=300):
    # Calculate scaling factor for the desired DPI
    zoom = dpi / 72  # 72 DPI is the default resolution
    mat = pymupdf.Matrix(zoom, zoom)  # Create a transformation matrix for zooming

    # Render the page with the scaling factor applied
    pix = page.get_pixmap(matrix=mat)  # Render page with increased DPI
    img = Image.open(io.BytesIO(pix.tobytes("png")))  # Convert to PIL image
    return img


def detect_underline_area(ocr_data, start_x, start_y, width):
    for i, word in enumerate(ocr_data['text']):
        if '...' in word or '____' in word:
            # Calculate the bounding box for the underline pattern
            x, y, w, h = ocr_data["left"][i], ocr_data["top"][i], ocr_data["width"][i], ocr_data["height"][i]
            # Check if it's within an appropriate area horizontally aligned with the field
            if start_y - 10 <= y <= start_y + 20 and x > start_x:
                return (x, y)  # Return the position to place text

# Function to apply OCR and overlay text on PDF
for doc_name in os.listdir("data"):
    pdf = pymupdf.open(f"data\\{doc_name}")

    for page_num in range(pdf.page_count):
        page = pdf[page_num]
        img = pdf_page_to_image(page, dpi=1000)
        ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, lang='ces')
        print(f'NAME: {doc_name}')
        print(ocr_data["text"])

        print('#############################################################################################')
        break





NAME: chodov.pdf
['', '', '', '', 'DOMOV', 'PRO', 'SENIORY', 'CHODOV', '', 'Donovalská', '2222/31,', '149', '00', 'PRAHA', '4', '—', 'CHODOV', '', '', '', '   ', '', '', '', '  ', ' ', '', '', '', 'DVOMOV', 'PRO', 'SŠEMLIORY', '', '', '', 'CHODOV', '', 'Zádost', '', '', '', 'o', 'pobytovou', 'službu', 'Domova', 'pro', 'seniory', 'Chodov', '', '(S', '49', 'zákona', 'č.', '108/2006', 'Sb.,', 'o', 'sociálních', 'službách)', '', '', '', 'Datum', 'podání', 'žádosti', '(podací', 'razítko)', '', '', '', ' ', '', '', '', '1.', 'OSOBNÍ', 'ÚDAJE', 'ŽADATELE', '', '', '', 'Národnost*', 'státní', 'příslušnost', '', '', '', 'rodinný', 'stav*:', '', '', '', 'svobodný/á', ' ženatý/vdaná', "'", "partner'", 'druh/družka', 'rozvedený/á', ' vdovec/vdova', '', 'bydliště:', '', '', '', '*nepovinný', 'údaj', '', 'žadatel', 'je', 'omezen', 've', 'svéprávnosti:“', 'ano', 'ne', '', '', '', 'podpůrná', 'opatření', 'při', 'narušení', 'schopnosti', 'zletilého', 'právně', "'|ednat:2", '', '', '', 'předběžně', 'pro