In [None]:
import PyPDF2
import pymupdf
import pdfplumber
import os
import tika

## Metadata

In [None]:

# Print the document-information metadata of every PDF in the data/ folder.
for document_name in os.listdir("data"):
    # Anything that is not a .pdf file is skipped.
    if not document_name.endswith(".pdf"):
        continue

    input_pdf = os.path.join("data", document_name)
    with open(input_pdf, "rb") as file:
        reader = PyPDF2.PdfReader(file)

        # Extract metadata
        metadata = reader.metadata
        print(f"Document Name: {document_name}")
        print("Metadata:")
        # `reader.metadata` is None when the PDF carries no information
        # dictionary; fall back to an empty mapping so such files are
        # reported with an empty metadata section instead of crashing.
        for key, value in (metadata or {}).items():
            print(f"{key}: {value}")
        print("\n")

## Different libraries

In [None]:
# Both file names currently refer to the same sample document.
fname1 = fname2 = os.path.join("data", "chodov.pdf")

# Maps a label searched for in the PDF to the value written at each match.
# The two keys differ only in the case of the first letter.
personal_info = dict(
    [
        ("jméno", "John Doe"),
        ("Jméno", "Kyryl smith"),
    ]
)


### PyMuPDF 

In [None]:
import pymupdf

# Write each value from `personal_info` at every place its key is found in the
# PDF, then save the modified copy under output/.
# The path is built with os.path.join so the cell also works outside Windows,
# and the target directory is created if it does not exist yet.
output_path = os.path.join("output", "PyMuPDF.pdf")
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# The context manager closes the document even if a page operation fails.
with pymupdf.open(fname1) as pdf_document:
    # Loop through each page
    for page in pdf_document:
        # Search for the placeholder text
        # NOTE(review): PyMuPDF documents search_for as ignoring case for ASCII
        # letters, so keys that differ only in case probably match the same
        # spots and each value gets drawn there - confirm this is intended.
        for key, value in personal_info.items():
            for inst in page.search_for(key):
                # Only the top-left corner of the match is used as the anchor.
                x0, y0, *_ = inst
                print(f'name: {key}, coords: {inst}')
                page.insert_text((x0, y0), value, fontsize=12)
    pdf_document.save(output_path)

In [None]:
# Dump the text of the first page of every file in data/ into one report.
#
# The report is written straight to the file instead of going through
# `%%capture cap`: that magic binds `cap` only after the whole cell has run,
# so reading `cap.stdout` inside the same cell fails on a fresh kernel or
# picks up the capture left behind by a previously executed cell.
report_path = os.path.join("output", "pymupdf", "get_text.txt")
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Explicit UTF-8 so non-ASCII text does not depend on the platform's locale.
with open(report_path, "w", encoding="utf-8") as f:
    for doc_name in os.listdir("data"):
        # The context manager closes each document once its page is read.
        with pymupdf.open(os.path.join("data", doc_name)) as doc:
            page1 = doc[0]
            print(f'NAME: {doc_name}', file=f)
            print(page1.get_text(), file=f)
        print('#############################################################################################', file=f)

### PDFplumber

In [None]:
import pdfplumber

# Dump the text of the first page of every file in data/ into one report.
#
# The report is written straight to the file instead of going through
# `%%capture cap`: that magic binds `cap` only after the whole cell has run,
# so reading `cap.stdout` inside the same cell fails on a fresh kernel or
# picks up the capture left behind by a previously executed cell.
report_path = os.path.join("output", "PDFplumber", "get_text.txt")
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Explicit UTF-8 so non-ASCII text does not depend on the platform's locale.
with open(report_path, "w", encoding="utf-8") as f:
    for doc_name in os.listdir("data"):
        with pdfplumber.open(os.path.join("data", doc_name)) as pdf:
            page1 = pdf.pages[0]
            print(f'NAME: {doc_name}', file=f)
            print(page1.extract_text(), file=f)
        print('#############################################################################################', file=f)


### PyPDF2

In [None]:
from PyPDF2 import PdfReader

# Dump the text of the first page of every file in data/ into one report.
#
# The report is written straight to the file instead of going through
# `%%capture cap`: that magic binds `cap` only after the whole cell has run,
# so reading `cap.stdout` inside the same cell fails on a fresh kernel or
# picks up the capture left behind by a previously executed cell.
report_path = os.path.join("output", "pypdf2", "get_text.txt")
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Explicit UTF-8 so non-ASCII text does not depend on the platform's locale.
with open(report_path, "w", encoding="utf-8") as f:
    for doc_name in os.listdir("data"):
        reader = PdfReader(os.path.join("data", doc_name))
        page1 = reader.pages[0]
        print(f'NAME: {doc_name}', file=f)
        print(page1.extract_text(), file=f)
        print('#############################################################################################', file=f)

### Tika

In [None]:
import tika
from tika import parser

# Dump the Tika metadata and content of every file in data/ into one report.
#
# The report is written straight to the file instead of going through
# `%%capture cap`: that magic binds `cap` only after the whole cell has run,
# so reading `cap.stdout` inside the same cell fails on a fresh kernel or
# picks up the capture left behind by a previously executed cell.
report_path = os.path.join("output", "tika", "get_text.txt")
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Explicit UTF-8 so non-ASCII text does not depend on the platform's locale.
with open(report_path, "w", encoding="utf-8") as f:
    for doc_name in os.listdir("data"):
        parsed = parser.from_file(os.path.join("data", doc_name))
        print(f'NAME: {doc_name}', file=f)
        print("Metadata: ", file=f)
        print(parsed["metadata"], file=f)
        print("Content: ", file=f)
        print(parsed["content"], file=f)

        print('#############################################################################################', file=f)

### pytesseract

In [None]:
import pytesseract

# Show which languages pytesseract reports as available.
available_languages = pytesseract.get_languages(config='')
print(available_languages)

In [None]:
import pytesseract
from PIL import Image
import io


def pdf_page_to_image(page, dpi=300):
    """Render a PDF page to a PIL image at the requested resolution.

    Args:
        page: Page object exposing ``get_pixmap`` (a PyMuPDF page in this
            notebook).
        dpi: Target resolution; the page is scaled by ``dpi / 72`` on both
            axes, 72 DPI being the default rendering resolution.

    Returns:
        The PIL image opened from the PNG encoding of the rendered page.
    """
    scale = dpi / 72
    pixmap = page.get_pixmap(matrix=pymupdf.Matrix(scale, scale))
    # Round-trip through in-memory PNG bytes to hand the pixels over to PIL.
    png_buffer = io.BytesIO(pixmap.tobytes("png"))
    return Image.open(png_buffer)


def detect_underline_area(ocr_data, start_x, start_y, width):
    """Find the first fill-in marker located to the right of a field label.

    A word counts as a fill-in marker when it contains ``...`` or ``____``.
    The first marker whose top coordinate lies between ``start_y - 10`` and
    ``start_y + 20`` (inclusive) and whose left coordinate is greater than
    ``start_x`` is selected.

    Args:
        ocr_data: Mapping with parallel ``"text"``, ``"left"`` and ``"top"``
            sequences (presumably the dict returned by
            ``pytesseract.image_to_data`` - confirm against the caller).
        start_x: Left coordinate the marker must exceed.
        start_y: Reference top coordinate for the vertical window.
        width: Currently unused; kept so existing callers keep working.

    Returns:
        ``(x, y)`` - the left/top coordinates of the matching word - or
        ``None`` when no word qualifies.
    """
    for i, word in enumerate(ocr_data["text"]):
        if "..." not in word and "____" not in word:
            continue
        # Only the position is needed; the box width/height are not read.
        x, y = ocr_data["left"][i], ocr_data["top"][i]
        # Check it is roughly on the same line as the field and to its right.
        if start_y - 10 <= y <= start_y + 20 and x > start_x:
            return (x, y)
    return None

# Run Czech OCR on the first page of every file in data/ and print the
# recognised words. No text is written back into the PDFs in this cell.
for doc_name in os.listdir("data"):
    # os.path.join keeps the path valid on non-Windows systems as well.
    # The document is intentionally left open so `pdf` and `page` stay usable
    # after the loop.
    pdf = pymupdf.open(os.path.join("data", doc_name))

    for page_num in range(pdf.page_count):
        page = pdf[page_num]
        # NOTE(review): 1000 DPI yields a very large bitmap per page - confirm
        # that this resolution is really needed for the OCR quality.
        img = pdf_page_to_image(page, dpi=1000)
        ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, lang='ces')
        print(f'NAME: {doc_name}')
        print(ocr_data["text"])

        print('#############################################################################################')
        # Only the first page of each document is processed.
        break

