# This is a simple PDF to WORD Docxs converter Script

In [1]:
import os
from pdf2docx import Converter
import pytesseract
from pdf2image import convert_from_path
from docx import Document

def convert_scanned_pdf_to_word(pdf_path, output_folder):
    """
    Converts a scanned PDF to Word document by:
    1. Converting PDF to images
    2. Performing OCR on the images
    3. Creating a Word document with the extracted text
    
    Args:
        pdf_path (str): Path to the PDF file
        output_folder (str): Folder to save the output Word document
    
    Returns:
        str: Path to the created Word document
    """
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Extract filename without extension
    pdf_filename = os.path.basename(pdf_path)
    filename_without_ext = os.path.splitext(pdf_filename)[0]
    docx_output = os.path.join(output_folder, f"{filename_without_ext}.docx")
    
    # First try direct conversion (faster but might not work well for scanned PDFs)
    try:
        print(f"Attempting direct conversion of {pdf_filename}...")
        cv = Converter(pdf_path)
        cv.convert(docx_output)
        cv.close()
        
        # Check if the conversion produced meaningful content
        doc = Document(docx_output)
        text_content = "\n".join([para.text for para in doc.paragraphs])
        
        # If the document has content, return it
        if len(text_content.strip()) > 100:  # Arbitrary threshold to determine if content is meaningful
            print(f"Direct conversion successful: {docx_output}")
            return docx_output
        else:
            print("Direct conversion produced limited text. Trying OCR method...")
            # Delete the file to prepare for OCR method
            os.remove(docx_output)
    except Exception as e:
        print(f"Direct conversion failed: {e}")
    
    # For scanned PDFs, use OCR approach
    try:
        print(f"Converting {pdf_filename} to images for OCR processing...")
        # Convert PDF to images
        images = convert_from_path(pdf_path)
        
        # Create a new Word document
        doc = Document()
        
        # Process each page
        for i, image in enumerate(images):
            print(f"Processing page {i+1}/{len(images)}...")
            # Extract text using OCR
            text = pytesseract.image_to_string(image)
            
            # Add page break between pages (except for the first page)
            if i > 0:
                doc.add_page_break()
                
            # Add text to document
            doc.add_paragraph(text)
        
        # Save the document
        doc.save(docx_output)
        print(f"OCR conversion completed: {docx_output}")
        return docx_output
    
    except Exception as e:
        print(f"OCR conversion failed: {e}")
        return None

def batch_convert_pdfs(input_folder, output_folder):
    """
    Converts all PDFs in a folder to Word documents
    
    Args:
        input_folder (str): Folder containing PDF files
        output_folder (str): Folder to save the output Word documents
    """
    # Ensure output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Get all PDF files in the input folder
    pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]
    
    if not pdf_files:
        print(f"No PDF files found in {input_folder}")
        return
    
    print(f"Found {len(pdf_files)} PDF files to convert")
    
    # Convert each PDF file
    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_folder, pdf_file)
        print(f"\nProcessing: {pdf_file}")
        output_path = convert_scanned_pdf_to_word(pdf_path, output_folder)
        if output_path:
            print(f"Conversion successful: {output_path}")
        else:
            print(f"Failed to convert: {pdf_file}")

In [3]:
# Change this:
convert_scanned_pdf_to_word("data/acta de inicio_1.pdf", "output")

# Don't do this - wildcards and root paths won't work:
# batch_convert_pdfs("data/*", "/output")

# Instead, do this:
batch_convert_pdfs("data", "output")

[INFO] Start to convert data/acta de inicio_1.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Attempting direct conversion of acta de inicio_1.pdf...


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] Terminated in 0.79s.


Direct conversion produced limited text. Trying OCR method...
Converting acta de inicio_1.pdf to images for OCR processing...


[INFO] Start to convert data/SOLICITUD.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Processing page 1/2...
OCR conversion failed: tesseract is not installed or it's not in your PATH. See README file for more information.
Found 7 PDF files to convert

Processing: SOLICITUD.pdf
Attempting direct conversion of SOLICITUD.pdf...


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/1) Page 1
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/1) Page 1
[INFO] Terminated in 0.32s.
[INFO] Start to convert data/PLIEGOS.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Direct conversion produced limited text. Trying OCR method...
Converting SOLICITUD.pdf to images for OCR processing...
Processing page 1/1...
OCR conversion failed: tesseract is not installed or it's not in your PATH. See README file for more information.
Failed to convert: SOLICITUD.pdf

Processing: PLIEGOS.pdf
Attempting direct conversion of PLIEGOS.pdf...


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/11) Page 1
[INFO] (2/11) Page 2
[INFO] (3/11) Page 3
[INFO] (4/11) Page 4
[INFO] (5/11) Page 5
[INFO] (6/11) Page 6
[INFO] (7/11) Page 7
[INFO] (8/11) Page 8
[INFO] (9/11) Page 9
[INFO] (10/11) Page 10
[INFO] (11/11) Page 11
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/11) Page 1
[INFO] (2/11) Page 2
[INFO] (3/11) Page 3
[INFO] (4/11) Page 4
[INFO] (5/11) Page 5
[INFO] (6/11) Page 6
[INFO] (7/11) Page 7
[INFO] (8/11) Page 8
[INFO] (9/11) Page 9
[INFO] (10/11) Page 10
[INFO] (11/11) Page 11
[INFO] Terminated in 1.28s.
[INFO] Start to convert data/CONVOCATORIA.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Direct conversion successful: output/PLIEGOS.docx
Conversion successful: output/PLIEGOS.docx

Processing: CONVOCATORIA.pdf
Attempting direct conversion of CONVOCATORIA.pdf...


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/1) Page 1
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/1) Page 1
[INFO] Terminated in 0.37s.
[INFO] Start to convert data/INVITACION.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Direct conversion produced limited text. Trying OCR method...
Converting CONVOCATORIA.pdf to images for OCR processing...
Processing page 1/1...
OCR conversion failed: tesseract is not installed or it's not in your PATH. See README file for more information.
Failed to convert: CONVOCATORIA.pdf

Processing: INVITACION.pdf
Attempting direct conversion of INVITACION.pdf...


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] Terminated in 0.69s.


Direct conversion produced limited text. Trying OCR method...
Converting INVITACION.pdf to images for OCR processing...


[INFO] Start to convert data/acta de inicio_2.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Processing page 1/2...
OCR conversion failed: tesseract is not installed or it's not in your PATH. See README file for more information.
Failed to convert: INVITACION.pdf

Processing: acta de inicio_2.pdf
Attempting direct conversion of acta de inicio_2.pdf...


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] Terminated in 0.79s.


Direct conversion produced limited text. Trying OCR method...
Converting acta de inicio_2.pdf to images for OCR processing...


[INFO] Start to convert data/acta de inicio_1.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Processing page 1/2...
OCR conversion failed: tesseract is not installed or it's not in your PATH. See README file for more information.
Failed to convert: acta de inicio_2.pdf

Processing: acta de inicio_1.pdf
Attempting direct conversion of acta de inicio_1.pdf...


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] Terminated in 0.78s.


Direct conversion produced limited text. Trying OCR method...
Converting acta de inicio_1.pdf to images for OCR processing...


[INFO] Start to convert data/MEMO SOLICITUD.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Processing page 1/2...
OCR conversion failed: tesseract is not installed or it's not in your PATH. See README file for more information.
Failed to convert: acta de inicio_1.pdf

Processing: MEMO SOLICITUD.pdf
Attempting direct conversion of MEMO SOLICITUD.pdf...


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/1) Page 1
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/1) Page 1
[INFO] Terminated in 0.40s.


Direct conversion produced limited text. Trying OCR method...
Converting MEMO SOLICITUD.pdf to images for OCR processing...
Processing page 1/1...
OCR conversion failed: tesseract is not installed or it's not in your PATH. See README file for more information.
Failed to convert: MEMO SOLICITUD.pdf
