In [1]:
import cv2
import pytesseract
from pytesseract import Output
from docx import Document
from docx.shared import Pt, Inches
from pdf2image import convert_from_path
import os
import numpy as np
import io


In [2]:
def extract_text_and_elements_from_pdf(pdf_path, dpi=300):
    """OCR every page of a PDF and collect Tesseract's per-page result dicts.

    Args:
        pdf_path: Path of the PDF; forwarded to ``convert_from_path``.
        dpi: Rasterisation resolution forwarded to ``convert_from_path``.
            Defaults to 300, the value that used to be hard-coded.

    Returns:
        A list with one entry per page, in page order. Each entry is whatever
        ``pytesseract.image_to_data(..., output_type=Output.DICT)`` returned
        for the grayscale rendering of that page.
    """
    pages = convert_from_path(pdf_path, dpi=dpi)
    extracted_data = []

    for page_image in pages:
        # The page is already a decoded image, so build the pixel array
        # directly instead of encoding it to PNG and decoding it again.
        # ``convert("RGB")`` pins a 3-channel layout for the colour conversion.
        rgb_pixels = np.array(page_image.convert("RGB"))

        # The array is in RGB channel order (not OpenCV's usual BGR), hence
        # COLOR_RGB2GRAY rather than COLOR_BGR2GRAY.
        gray_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)

        ocr_data = pytesseract.image_to_data(gray_pixels, output_type=Output.DICT)
        extracted_data.append(ocr_data)

    return extracted_data

def _iter_ocr_lines(page_data):
    """Yield the text of each OCR line on one page, words joined by one space.

    ``page_data`` is a dict of parallel lists. Consecutive non-blank entries
    of ``page_data["text"]`` that share the same ``block_num`` / ``par_num`` /
    ``line_num`` values are merged into one line. If any of those three
    columns is missing, each word is yielded on its own.
    """
    group_columns = [page_data.get(key) for key in ("block_num", "par_num", "line_num")]
    can_group = all(column is not None for column in group_columns)

    current_key = None
    current_words = []
    for index, raw_word in enumerate(page_data["text"]):
        word = raw_word.strip()
        if not word:
            # Blank rows carry no word to place (presumably Tesseract's
            # page/block/paragraph/line container rows; verify against its
            # TSV output format).
            continue

        key = tuple(column[index] for column in group_columns) if can_group else index
        if current_words and key != current_key:
            yield " ".join(current_words)
            current_words = []
        current_key = key
        current_words.append(word)

    if current_words:
        yield " ".join(current_words)


def recreate_pdf_layout_in_word(extracted_data, output_docx_path):
    """Write OCR results to a .docx file, one paragraph per OCR line.

    Args:
        extracted_data: Iterable of per-page dicts of parallel lists with at
            least a ``"text"`` key, as produced by
            ``extract_text_and_elements_from_pdf``.
        output_docx_path: Destination passed to ``Document.save``.

    Each source page starts on a new Word page. Text is set to 10 pt. Word
    coordinates are not used; only line grouping is preserved.
    """
    doc = Document()

    for page_index, page_data in enumerate(extracted_data):
        if page_index:
            # Break *between* pages only, so the document does not end with
            # an empty trailing page.
            doc.add_page_break()

        for line_text in _iter_ocr_lines(page_data):
            paragraph = doc.add_paragraph(line_text)
            paragraph.runs[0].font.size = Pt(10)

    doc.save(output_docx_path)


def convert_pdf_to_word_with_identical_layout(pdf_path, output_docx_path):
    """Run the two-stage PDF -> Word pipeline, printing progress along the way.

    Stage one calls ``extract_text_and_elements_from_pdf(pdf_path)``; stage
    two passes that result, together with ``output_docx_path``, to
    ``recreate_pdf_layout_in_word``. Nothing is returned.
    """
    # Stage 1: extraction.
    print("Extracting text and elements from PDF...")
    ocr_pages = extract_text_and_elements_from_pdf(pdf_path)

    # Stage 2: write the .docx from the extracted pages.
    print("Recreating layout in Word document...")
    recreate_pdf_layout_in_word(ocr_pages, output_docx_path)

    print(f"Conversion completed! Output saved at: {output_docx_path}")

In [None]:
# Source PDF and destination .docx. Both are relative paths, so they resolve
# against the kernel's current working directory.
pdf_path = "docs/sample test 1.pdf"
output_docx_path = "output.docx"

# Kick off the conversion with the paths configured above.
convert_pdf_to_word_with_identical_layout(
    pdf_path=pdf_path,
    output_docx_path=output_docx_path,
)

Extracting text and elements from PDF...
