In [3]:
!pip install pymupdf pillow pytesseract pdf2image
!apt-get update -y
!apt-get install -y tesseract-ocr libreoffice

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pymupdf, pdf2image
Successfully installed pdf2image-1.17.0 pymupdf-1.26.6 pytesseract-0.3.13
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://cli.github.com/packages stable InRelease
Get:4 https://developer.download

In [4]:
import os
import subprocess
import fitz  # PyMuPDF
from PIL import Image, ImageDraw
import pytesseract
import pandas as pd
from pdf2image import convert_from_path

from google.colab import files


In [5]:
class DocumentHighlighter:
    """
    Draw red, unfilled boxes around occurrences of a search phrase.

    The input file is never modified: every pipeline writes a new
    ``<name>_highlighted.<ext>`` file into the current working directory.

    * PDFs are searched with PyMuPDF and annotated with rectangle annotations.
    * Raster images are OCR'd with Tesseract and the boxes are painted onto a copy.
    * Word/Excel files are converted to PDF with LibreOffice, then handled as PDFs.
    """

    # File extensions (lower-case, with leading dot) accepted by each pipeline.
    PDF_EXTENSIONS = (".pdf",)
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp")
    OFFICE_EXTENSIONS = (".xlsx", ".xls", ".docx", ".doc")

    def __init__(self, search_text: str):
        """
        Args:
            search_text: Phrase to look for; surrounding whitespace is ignored.

        Raises:
            ValueError: If the phrase is empty after stripping whitespace.
        """
        self.search_text = search_text.strip()
        if not self.search_text:
            raise ValueError("Search text cannot be empty.")

    # ---------- Public API ----------
    def process(self, input_path: str) -> str:
        """
        Dispatch on the file extension and annotate a copy of the document.

        Args:
            input_path: Path to a PDF, image, Word or Excel file.

        Returns:
            Path to the *new* annotated output file.

        Raises:
            ValueError: If the extension is not supported.
        """
        ext = os.path.splitext(input_path)[1].lower()

        if ext in self.PDF_EXTENSIONS:
            return self._highlight_pdf(input_path)

        if ext in self.IMAGE_EXTENSIONS:
            return self._highlight_image(input_path)

        if ext in self.OFFICE_EXTENSIONS:
            import tempfile

            # Convert inside a throw-away directory so that (a) an existing
            # "<name>.pdf" next to the input is never overwritten and (b) the
            # intermediate PDF does not linger after the run. The highlighted
            # output is written to the working directory, so it survives cleanup.
            with tempfile.TemporaryDirectory(prefix="highlighter_") as tmp_dir:
                pdf_path = self._convert_to_pdf(input_path, out_dir=tmp_dir)
                return self._highlight_pdf(
                    pdf_path, base_name=os.path.basename(input_path)
                )

        raise ValueError(f"Unsupported file type: {ext}")

    # ---------- Conversion helpers ----------
    def _convert_to_pdf(self, input_path: str, out_dir: str = None) -> str:
        """
        Use LibreOffice (soffice) to convert Word/Excel to PDF.

        Args:
            input_path: Path to the office document.
            out_dir: Directory that receives the PDF. When omitted, a fresh
                temporary directory is created (the caller owns its cleanup),
                so files sitting next to the input are never overwritten.

        Returns:
            Absolute path of the generated PDF.

        Raises:
            RuntimeError: If ``soffice`` exits with a non-zero status.
            FileNotFoundError: If ``soffice`` reported success but produced no PDF.
        """
        import tempfile

        input_path = os.path.abspath(input_path)
        if out_dir is None:
            out_dir = tempfile.mkdtemp(prefix="highlighter_")
        out_dir = os.path.abspath(out_dir)

        # Run LibreOffice headless conversion (argument list, no shell involved).
        result = subprocess.run(
            [
                "soffice",
                "--headless",
                "--convert-to",
                "pdf",
                "--outdir",
                out_dir,
                input_path,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        if result.returncode != 0:
            print("LibreOffice error:\n", result.stderr)
            raise RuntimeError("Failed to convert to PDF.")

        # LibreOffice names the output after the input's stem. It can exit with
        # status 0 without writing anything, hence the explicit existence check.
        stem = os.path.splitext(os.path.basename(input_path))[0]
        pdf_path = os.path.join(out_dir, stem + ".pdf")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"Expected PDF not found at {pdf_path}")

        return pdf_path

    # ---------- PDF highlighting ----------
    def _highlight_pdf(self, pdf_path: str, base_name: str = None) -> str:
        """
        Search text and draw red unfilled bounding boxes on a COPY of the PDF.

        Args:
            pdf_path: PDF to search.
            base_name: Optional file name whose stem is used for the output
                (used so a converted ``report.docx`` yields
                ``report_highlighted.pdf``). Defaults to ``pdf_path``'s name.

        Returns:
            Path of the annotated PDF written to the working directory.
        """
        base = base_name if base_name else os.path.basename(pdf_path)
        base_no_ext = os.path.splitext(base)[0]
        output_path = f"{base_no_ext}_highlighted.pdf"

        # Counts rectangles: a phrase that wraps over several lines yields one
        # rectangle per line.
        matches_total = 0

        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                # PyMuPDF's search is case-insensitive on its own. `flags` does
                # not control case: it selects text-extraction options
                # (ligatures, whitespace, de-hyphenation), so the library
                # defaults are kept rather than overriding them.
                for rect in page.search_for(self.search_text):
                    annot = page.add_rect_annot(rect)
                    # Red stroke, transparent fill
                    annot.set_colors(stroke=(1, 0, 0))
                    annot.set_border(width=1)
                    annot.update()
                    matches_total += 1

            if matches_total == 0:
                print("No matches found in PDF.")

            # Save as new file; the source PDF is left untouched.
            doc.save(output_path)
        finally:
            # Release the file handle even if annotating or saving failed.
            doc.close()

        print(f"Created: {output_path} (matches: {matches_total})")
        return output_path

    # ---------- Image highlighting with OCR ----------
    @staticmethod
    def _find_phrase_boxes(words, boxes, line_ids, target_words):
        """
        Locate every occurrence of a word sequence in an OCR word stream.

        Args:
            words: Normalised OCR words in reading order.
            boxes: ``(x0, y0, x1, y1)`` box of each word, parallel to ``words``.
            line_ids: Hashable line identifier of each word, parallel to ``words``.
            target_words: Normalised words of the phrase to find.

        Returns:
            One entry per occurrence; each entry is a list of ``(x0, y0, x1, y1)``
            boxes, one per text line the occurrence touches, in reading order.
        """
        span = len(target_words)
        occurrences = []
        if span == 0:
            return occurrences

        for start in range(len(words) - span + 1):
            if words[start:start + span] != target_words:
                continue

            # Merge the word boxes line by line so a phrase that wraps gets one
            # tight rectangle per line instead of one huge box.
            per_line = {}
            for idx in range(start, start + span):
                x0, y0, x1, y1 = boxes[idx]
                key = line_ids[idx]
                if key in per_line:
                    px0, py0, px1, py1 = per_line[key]
                    per_line[key] = (
                        min(px0, x0), min(py0, y0), max(px1, x1), max(py1, y1)
                    )
                else:
                    per_line[key] = (x0, y0, x1, y1)
            occurrences.append(list(per_line.values()))

        return occurrences

    def _highlight_image(self, image_path: str) -> str:
        """
        Use Tesseract OCR to find text positions and draw red rectangles
        around matches on a COPY of the image.

        Matching is case-insensitive and word based (whitespace-separated
        tokens must be equal); a phrase may continue on the next OCR line.

        Returns:
            Path of the annotated PNG written to the working directory.

        Raises:
            ValueError: If the search text contains no words.
        """
        target_words = self.search_text.lower().split()
        if not target_words:
            raise ValueError("Search text is empty after normalization.")

        # convert() returns an independent in-memory copy, so the source file
        # handle can be closed straight away.
        with Image.open(image_path) as source:
            img = source.convert("RGB")

        ocr = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)

        # Drop rows without text, normalise, and discard whitespace-only tokens
        # (they would otherwise sit between real words and break phrase matches).
        # astype(str) guards against the text column being parsed as numbers.
        ocr = ocr.dropna(subset=["text"])
        ocr = ocr.assign(text_norm=ocr["text"].astype(str).str.strip().str.lower())
        ocr = ocr[ocr["text_norm"] != ""]

        words = ocr["text_norm"].tolist()
        boxes = [
            (int(left), int(top), int(left + width), int(top + height))
            for left, top, width, height in zip(
                ocr["left"], ocr["top"], ocr["width"], ocr["height"]
            )
        ]
        # A text line is identified by (block_num, par_num, line_num).
        line_ids = list(zip(ocr["block_num"], ocr["par_num"], ocr["line_num"]))

        occurrences = self._find_phrase_boxes(words, boxes, line_ids, target_words)

        draw = ImageDraw.Draw(img)
        for line_boxes in occurrences:
            for bbox in line_boxes:
                # Draw a red, unfilled rectangle
                draw.rectangle(bbox, outline="red", width=2)
        matches_total = len(occurrences)

        if matches_total == 0:
            print("No matches found in image (OCR).")

        base = os.path.basename(image_path)
        base_no_ext, _ = os.path.splitext(base)
        output_path = f"{base_no_ext}_highlighted.png"
        img.save(output_path)
        print(f"Created: {output_path} (matches: {matches_total})")

        return output_path


In [8]:
# Interactive driver: upload -> prompt -> annotate -> download.
# Relies on `files` (google.colab) and `DocumentHighlighter` from earlier cells.

# Step 1 - pick a file through the Colab upload widget.
print("Upload a PDF / Excel / Word / Image file...")
uploaded = files.upload()

if not uploaded:
    raise RuntimeError("No file uploaded.")

# The widget returns a mapping keyed by the saved file name; only the first
# entry is processed.
input_filename = next(iter(uploaded))
print(f"Uploaded: {input_filename}")

# Step 2 - read the phrase to look for.
search_text = input("Enter the text to search for: ")
search_text = search_text.strip()
if not search_text:
    raise ValueError("Search text cannot be empty.")

# Step 3 - annotate a copy of the uploaded document.
highlighter = DocumentHighlighter(search_text)
output_path = highlighter.process(input_filename)

# Step 4 - hand the annotated copy back to the browser.
print("Downloading annotated file...")
files.download(output_path)


Upload a PDF / Excel / Word / Image file...


Saving Instruction Sheet_AuditRAM.pdf to Instruction Sheet_AuditRAM (1).pdf
Uploaded: Instruction Sheet_AuditRAM (1).pdf
Enter the text to search for: The goal of this assignment is to create a Python program that takes a file and a text string as input, searches for the text within the file, and then generates an output file/view where the found text is highlighted with a red, unfilled bounding box overlay, without altering the original file content.
Created: Instruction Sheet_AuditRAM (1)_highlighted.pdf (matches: 8)
Downloading annotated file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>