In [None]:
import pymupdf  # PyMuPDF
import pytesseract
from PIL import Image
import io
import os

In [None]:
# Values to write into the form, keyed by field name
personal_info = dict(
    name="John Doe",
    birth_date="01/01/1990",
    address="123 Main St, Anytown, USA",
    phone="(123) 456-7890",
)

# Label fragments (lower-case, English and Czech) that identify each field in the OCR output
keywords = dict(
    name=["name", "full name", "jméno"],
    surname=["surname", "příjmení"],
    birth_date=["birth date", "dob", "date of birth", "datum a místo narození", "datum"],
    address=["address", "residence"],
    phone=["phone", "contact number"],
)

def pdf_page_to_image(page, dpi=300):
    """Rasterize a PDF page and return it as a PIL image.

    Args:
        page: Page object exposing ``get_pixmap(matrix=...)``.
        dpi: Target resolution of the rendered image.

    Returns:
        A PIL image opened from the PNG-encoded rendering of the page.
    """
    # The renderer's baseline is 72 DPI, so dpi / 72 is the zoom needed on both axes
    scale = dpi / 72
    pixmap = page.get_pixmap(matrix=pymupdf.Matrix(scale, scale))

    # Round-trip through an in-memory PNG to hand the pixels over to PIL
    png_buffer = io.BytesIO(pixmap.tobytes("png"))
    return Image.open(png_buffer)


def detect_underline_area(ocr_data, start_x, start_y, width):
    """Find the first dotted/underscored placeholder on the same row as a label.

    Args:
        ocr_data: Dict of parallel lists; the "text", "left" and "top" lists
            are read here.
        start_x: Left coordinate of the label; the placeholder must start
            strictly to the right of it.
        start_y: Top coordinate of the label; the placeholder's top must lie
            between ``start_y - 10`` and ``start_y + 20`` (inclusive).
        width: Width of the label. Currently unused.

    Returns:
        ``(left, top)`` of the first matching placeholder, or ``None`` when
        there is none.
    """
    lefts = ocr_data["left"]
    tops = ocr_data["top"]
    top_min, top_max = start_y - 10, start_y + 20

    def is_placeholder(token):
        # A run of dots or underscores marks a fill-in area
        return '...' in token or '____' in token

    for idx, token in enumerate(ocr_data['text']):
        if not is_placeholder(token):
            continue
        left, top = lefts[idx], tops[idx]
        # Skip placeholders on a different row or not to the right of the label
        if left <= start_x or not (top_min <= top <= top_max):
            continue
        return (left, top)
    return None

def _has_free_space_right(ocr_data, index, image_width, gap=50):
    """Return True when the `gap` pixels right of OCR entry `index` are empty.

    "Empty" means the strip lies inside the image and no other entry with
    non-blank text on the same row overlaps it.
    """
    x, y = ocr_data["left"][index], ocr_data["top"][index]
    w, h = ocr_data["width"][index], ocr_data["height"][index]
    strip_start, strip_end = x + w, x + w + gap

    # The strip must fit on the rendered page
    if strip_end >= image_width:
        return False

    for j, text in enumerate(ocr_data["text"]):
        # Entries with blank text carry nothing visible, so they cannot block the strip
        if j == index or not text.strip():
            continue
        ox, oy = ocr_data["left"][j], ocr_data["top"][j]
        ow, oh = ocr_data["width"][j], ocr_data["height"][j]
        same_row = oy < y + h and oy + oh > y
        in_strip = ox < strip_end and ox + ow > strip_start
        if same_row and in_strip:
            return False
    return True


# Function to apply OCR and overlay text on PDF
def fill_pdf_using_ocr(input_pdf_path, output_pdf_path, data, keywords, dpi=300):
    """Locate form labels via OCR and write the matching values next to them.

    Every page is rendered at `dpi`, saved as ``output_image<N>.png`` in the
    working directory (debug artifact), and OCR'd. Each recognised word is
    lower-cased and compared by substring against the keyword lists, so a
    keyword such as "name" also matches the word "surname". For each hit
    whose field is present in `data`, the value is placed, in order of
    preference:

    1. in empty space directly to the right of the label,
    2. at a dotted/underscored placeholder on the same row,
    3. just below the label.

    Args:
        input_pdf_path: Path of the PDF form to read.
        output_pdf_path: Path the filled PDF is saved to.
        data: Mapping of field name to the text to insert.
        keywords: Mapping of field name to a list of lower-case label
            fragments identifying that field.
        dpi: Resolution used to render pages for OCR.

    Raises:
        ValueError: If `dpi` is not positive.
    """
    if dpi <= 0:
        raise ValueError("dpi must be positive")

    # OCR boxes are in pixels of the rendered image, while insert_text expects
    # page coordinates (72 units per inch), so positions are scaled back.
    px_to_pt = 72 / dpi

    # Open the PDF
    pdf = pymupdf.open(input_pdf_path)
    try:
        # Loop through each page
        for page_num in range(pdf.page_count):
            page = pdf[page_num]
            # Convert PDF page to an image for OCR
            img = pdf_page_to_image(page, dpi=dpi)
            img.save(f"output_image{page_num}.png", "PNG")

            # Perform OCR on the image
            ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
            print(ocr_data["text"])

            # Loop through OCR detected words to find locations for each keyword
            for i, word in enumerate(ocr_data["text"]):
                word = word.lower()
                for field, kw_list in keywords.items():
                    if not any(keyword in word for keyword in kw_list):
                        continue
                    print("HIT ------------>", word)

                    # Nothing to write for fields without a value
                    if field not in data:
                        continue

                    # Bounding box of the keyword, in image pixels
                    x, y, w, h = (ocr_data["left"][i], ocr_data["top"][i],
                                  ocr_data["width"][i], ocr_data["height"][i])

                    text_position = None

                    # Prefer empty space directly to the right of the label
                    if _has_free_space_right(ocr_data, i, img.width, gap=50):
                        text_position = (x + w + 10, y + h // 2)

                    # Otherwise look for an underline/dotted placeholder on the same row
                    if text_position is None:
                        text_position = detect_underline_area(ocr_data, x, y, w)

                    # Last resort: place the text just below the keyword
                    if text_position is None:
                        text_position = (x, y + h + 10)

                    # Convert from image pixels to page coordinates before writing
                    point = (text_position[0] * px_to_pt, text_position[1] * px_to_pt)
                    print("INSERTTING:", data[field])
                    page.insert_text(point, data[field], fontsize=10, color=(0, 0, 0))

        # Save the filled PDF
        pdf.save(output_pdf_path)
    finally:
        # Release the document even if rendering, OCR or saving failed
        pdf.close()
    print("Form filled and saved successfully!")

# Source form and destination of the filled copy, relative to the working directory
input_pdf, output_pdf = (
    os.path.join("data", "chodov.pdf"),
    os.path.join("output", "output3.pdf"),
)

# Run the OCR-driven fill with the sample data and keyword lists
fill_pdf_using_ocr(
    input_pdf_path=input_pdf,
    output_pdf_path=output_pdf,
    data=personal_info,
    keywords=keywords,
)
