In [10]:
import pymupdf  # PyMuPDF
from PIL import Image, ImageDraw
import io

def extract_text_from_pdf(input_pdf_path):
    """
    Extract every word of a PDF together with its position on the page.

    Args:
        input_pdf_path: Path of the PDF file to read.

    Returns:
        A list with one ``(page_num, words)`` tuple per page, in page order.
        ``page_num`` is the zero-based page index and ``words`` is the list
        returned by ``page.get_text("words")``; each entry is a tuple
        ``(x0, y0, x1, y1, text, block_no, line_no, word_no)``.
    """
    # The context manager closes the document even if text extraction raises,
    # so a failing page no longer leaks the open file handle.
    with pymupdf.open(input_pdf_path) as document:
        return [
            (page_num, page.get_text("words"))
            for page_num, page in enumerate(document)
        ]


def invert_background_and_graphics(input_pdf_path, output_pdf_path):
    """
    Write a dark copy of a PDF: black page background, colour-inverted images.

    Every page is covered with an opaque black rectangle, which also hides the
    page's original text and images. Each embedded raster image is then
    colour-inverted and drawn again, on top of the rectangle, at the
    position(s) where it was originally displayed. The hidden text is not
    restored by this function.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the processed PDF is saved to.
    """
    # The context manager closes the document even if processing or saving fails
    with pymupdf.open(input_pdf_path) as document:
        for page in document:
            # Record each image and where it is shown before modifying the
            # page, so the inverted copies can be put back in the same places.
            placements = [
                (img[0], page.get_image_rects(img[0]))
                for img in page.get_images(full=True)
            ]

            # Cover the whole page with a black fill (white 1-unit outline);
            # this turns the background dark and hides the original content.
            shape = page.new_shape()
            shape.draw_rect(page.rect)
            shape.finish(color=(1, 1, 1), fill=(0, 0, 0))
            shape.commit()

            for xref, rects in placements:
                if not rects:
                    # Image is referenced by the page but never displayed
                    continue

                image_bytes = document.extract_image(xref)["image"]
                pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                inverted_image = Image.eval(pil_image, lambda x: 255 - x)

                # Re-encode the inverted image as PNG for insertion
                buffer = io.BytesIO()
                inverted_image.save(buffer, format="PNG")
                png_bytes = buffer.getvalue()

                # Draw the inverted image at every original location instead
                # of stretching it over the full page.
                # NOTE: the bounding rectangle is used, so an image that was
                # rotated or skewed on the page is re-drawn upright.
                for rect in rects:
                    page.insert_image(rect, stream=png_bytes)

        document.save(output_pdf_path)


def add_text_back_to_pdf(original_text, processed_pdf_path, final_pdf_path, fontsize=9):
    """
    Write previously extracted words onto a PDF as white text.

    Args:
        original_text: Iterable of ``(page_num, words)`` tuples, where
            ``page_num`` is a zero-based page index into the processed PDF and
            each word is a tuple starting with ``(x0, y0, x1, y1, text, ...)``.
        processed_pdf_path: Path of the PDF to draw the text onto.
        final_pdf_path: Path the resulting PDF is saved to.
        fontsize: Font size used for every inserted word. Defaults to 9.
    """
    # The context manager closes the document even if insertion or saving fails
    with pymupdf.open(processed_pdf_path) as document:
        for page_num, words in original_text:
            page = document[page_num]
            for x0, _y0, _x1, y1, text, *_ in words:
                # insert_text expects the bottom-left point of the first
                # character, so anchor at the bottom of the word box (y1);
                # using the top (y0) would draw each word one line too high.
                page.insert_text(
                    (x0, y1), text, color=(1, 1, 1), fontsize=fontsize
                )

        document.save(final_pdf_path)


# === Running the process ===
# NOTE(review): the input is an absolute, user-specific path; the script only
# runs unchanged on a machine where this file exists.
input_pdf = "/home/siddhesh/Downloads/SampleContract-Shuttle.pdf"
inverted_pdf = "./inverted.pdf"  # intermediate file: written by step 2, read by step 3
final_pdf = "./final_output.pdf"  # final result: written by step 3

# Step 1: Extract text from the original PDF as a list of (page_num, words) tuples
text_data = extract_text_from_pdf(input_pdf)

# Step 2: Invert background and images, saving the result to inverted_pdf
invert_background_and_graphics(input_pdf, inverted_pdf)

# Step 3: Add the text from step 1 back onto the inverted PDF and save it to final_pdf
add_text_back_to_pdf(text_data, inverted_pdf, final_pdf)

# NOTE(review): the message hard-codes the file name rather than using final_pdf
print("PDF processing complete. Check final_output.pdf")

PDF processing complete. Check final_output.pdf
