In [2]:
!pip install PyMuPDF
!pip install pytesseract
!sudo apt install tesseract-ocr
!pip install python-docx

Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.1/24.1 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.6
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 

In [3]:
import os
import fitz
from PIL import Image, ImageDraw
import pytesseract
from openpyxl import load_workbook
from openpyxl.styles import Border, Side
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from google.colab import files

In [12]:
# Tell pytesseract which Tesseract executable to launch for OCR.
# NOTE(review): '/usr/bin/tesseract' is presumably where the apt package puts the
# binary on Colab — adjust this path when running in another environment.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'


def highlight_text_in_pdf(input_pdf_path, search_text, output_pdf_path):
    """Outline every hit of ``search_text`` in a PDF with a red rectangle.

    Each input page is re-drawn onto a same-sized page of a new document, and
    an unfilled red rectangle is drawn around every rectangle returned by
    ``page.search_for``. The new document is written to ``output_pdf_path``
    only when at least one rectangle was found.

    Args:
        input_pdf_path: Path of the PDF to search.
        search_text: Text to look for.
        output_pdf_path: Destination path for the annotated copy.

    Returns:
        True when at least one hit was found and the output was saved;
        False when nothing was found or an error occurred (errors are
        printed, not raised).
    """
    doc = None
    output_doc = None
    try:
        doc = fitz.open(input_pdf_path)
        # The annotated copy is built in a fresh document so the input is never modified.
        output_doc = fitz.open()
        found_count = 0

        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            # Reproduce the original page on a new page of identical size.
            new_page = output_doc.new_page(width=page.rect.width, height=page.rect.height)
            new_page.show_pdf_page(new_page.rect, doc, page_num)

            # NOTE(review): the count below is the number of hit rectangles; a phrase
            # that wraps across lines is presumably reported as several rectangles,
            # so it can exceed the number of logical occurrences.
            text_instances = page.search_for(search_text)
            found_count += len(text_instances)
            for inst in text_instances:
                # Unfilled red box around the hit.
                new_page.draw_rect(rect=inst, color=(1, 0, 0), width=2, fill=None)

        if found_count > 0:
            output_doc.save(output_pdf_path)
            print(f"PDF Success! Found {found_count} instances. Output saved locally in Colab.")
            return True

        print(f"PDF Warning: Text '{search_text}' not found. No output file created.")
        return False
    except Exception as e:
        print(f"Error processing PDF file: {e}")
        return False
    finally:
        # Compare against None instead of relying on truthiness: a document
        # object with zero pages evaluates as falsy (its length is the page
        # count) and would otherwise be left open.
        if doc is not None:
            doc.close()
        if output_doc is not None:
            output_doc.close()


def _phrase_windows(words, tokens):
    """Yield (start, stop) index pairs of consecutive ``words`` that contain the phrase ``tokens``.

    Both arguments are lists of lower-cased strings without whitespace. A
    single token matches any word containing it as a substring; a longer
    phrase matches when the first word ends with the first token, the last
    word starts with the last token and every word in between is equal to
    its token.
    """
    span = len(tokens)
    if span == 1:
        for idx, word in enumerate(words):
            if tokens[0] in word:
                yield idx, idx + 1
        return

    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if (window[0].endswith(tokens[0])
                and window[-1].startswith(tokens[-1])
                and window[1:-1] == tokens[1:-1]):
            yield start, start + span


def highlight_text_in_image(input_image_path, search_text, output_image_path):
    """Box every OCR-detected occurrence of ``search_text`` in an image.

    The image is OCR'd with pytesseract; matching is case-insensitive. Because
    the OCR result is a list of separate text entries, a search string that
    contains spaces is matched against runs of consecutive entries, and every
    entry taking part in a match gets its own red rectangle. The annotated
    image is saved to ``output_image_path`` only when something was found.

    Args:
        input_image_path: Path of the image to search.
        search_text: Word or phrase to look for.
        output_image_path: Destination path for the annotated image.

    Returns:
        True when at least one occurrence was found and the output was saved;
        False when nothing was found, the search text is blank, or an error
        occurred (errors are printed, not raised).
    """
    try:
        tokens = search_text.lower().split()
        if not tokens:
            # A blank search string cannot match anything meaningful.
            print(f"Image Warning: Text '{search_text}' not found via OCR. No output file created.")
            return False

        img = Image.open(input_image_path).convert('RGB')
        draw = ImageDraw.Draw(img)
        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

        # Keep only the entries that actually carry text, remembering their
        # row index so the matching box coordinates can be looked up later.
        text_rows = [i for i, text in enumerate(data['text']) if text and text.strip()]
        words = [data['text'][i].strip().lower() for i in text_rows]

        found_count = 0
        for start, stop in _phrase_windows(words, tokens):
            found_count += 1
            for i in text_rows[start:stop]:
                (x, y, w, h) = (data['left'][i], data['top'][i], data['width'][i], data['height'][i])
                draw.rectangle([(x, y), (x + w, y + h)], outline="red", width=3)

        if found_count > 0:
            img.save(output_image_path)
            print(f"Image Success! Found {found_count} instances. Output saved locally in Colab.")
            return True

        print(f"Image Warning: Text '{search_text}' not found via OCR. No output file created.")
        return False

    except pytesseract.TesseractNotFoundError:
        print("ERROR: Tesseract is not configured correctly in Colab.")
        return False
    except Exception as e:
        print(f"Error processing image: {e}")
        return False


def highlight_text_in_excel(input_excel_path, search_text, output_excel_path):
    """Mark every text cell containing ``search_text`` with a thin red border.

    All worksheets of the workbook are scanned. Only cells whose value is a
    non-empty string are considered, and the comparison is case-insensitive.
    The workbook is written to ``output_excel_path`` only when at least one
    cell matched.

    Args:
        input_excel_path: Path of the workbook to search.
        search_text: Text to look for inside cell values.
        output_excel_path: Destination path for the annotated workbook.

    Returns:
        True when at least one cell matched and the output was saved;
        False when nothing matched or an error occurred (errors are printed,
        not raised).
    """
    try:
        workbook = load_workbook(input_excel_path)
        edge = Side(border_style="thin", color="FF0000")
        box = Border(**{name: edge for name in ("top", "bottom", "left", "right")})

        # Gather the matching cells first, then style them in one pass.
        matching_cells = [
            cell
            for worksheet in workbook.worksheets
            for cells in worksheet.iter_rows()
            for cell in cells
            if isinstance(cell.value, str)
            and cell.value
            and search_text.lower() in cell.value.lower()
        ]
        for cell in matching_cells:
            cell.border = box

        found_count = len(matching_cells)
        if found_count == 0:
            print(f"Excel Warning: Text '{search_text}' not found. No output file created.")
            return False

        workbook.save(output_excel_path)
        print(f"Excel Success! Found {found_count} instances. Output saved locally in Colab.")
        print("Note: The 'bounding box' is represented by a red cell border.")
        return True

    except Exception as e:
        print(f"Error processing Excel file: {e}")
        return False

def highlight_text_in_word(input_word_path, search_text, output_word_path):
    """Highlight every occurrence of ``search_text`` in a Word document in red.

    For each paragraph in ``doc.paragraphs`` the text of its runs is joined
    and searched case-insensitively. Every run that overlaps an occurrence
    is given a red highlight, so a match that is split across several runs
    is still found. The whole run is highlighted, not just the matched
    characters. The document is written to ``output_word_path`` only when
    something was found.

    Args:
        input_word_path: Path of the .docx file to search.
        search_text: Text to look for.
        output_word_path: Destination path for the annotated document.

    Returns:
        True when at least one occurrence was found and the output was saved;
        False when nothing was found, the search text is empty, or an error
        occurred (errors are printed, not raised).
    """
    try:
        needle = search_text.lower()
        if not needle:
            # An empty needle would "match" everywhere; treat it as not found.
            print(f"Word Warning: Text '{search_text}' not found. No output file created.")
            return False

        doc = Document(input_word_path)
        found_count = 0

        for paragraph in doc.paragraphs:
            runs = paragraph.runs
            # Lower-case each run separately so the offsets computed below
            # stay aligned with the run boundaries.
            lowered = [run.text.lower() for run in runs]
            haystack = "".join(lowered)

            # Collect the [start, end) span of every non-overlapping occurrence.
            hits = []
            pos = haystack.find(needle)
            while pos != -1:
                hits.append((pos, pos + len(needle)))
                pos = haystack.find(needle, pos + len(needle))
            if not hits:
                continue
            found_count += len(hits)

            # Highlight each run whose character range intersects a hit.
            offset = 0
            for run, text in zip(runs, lowered):
                run_start, run_end = offset, offset + len(text)
                offset = run_end
                if any(run_start < hit_end and hit_start < run_end
                       for hit_start, hit_end in hits):
                    run.font.highlight_color = WD_COLOR_INDEX.RED

        if found_count > 0:
            doc.save(output_word_path)
            print(f"Word Success! Found {found_count} instances. Output saved locally in Colab.")
            print("Note: The 'bounding box' is represented by red text highlighting.")
            return True

        print(f"Word Warning: Text '{search_text}' not found. No output file created.")
        return False

    except Exception as e:
        print(f"Error processing Word file: {e}")
        return False


#Main Execution Logic for Colab

def process_file_in_colab():
    """Run the interactive upload -> search -> highlight -> download flow.

    Asks for a file upload, then prompts for the text to search for. The
    first uploaded file is dispatched to the highlighter that matches its
    (lower-cased) extension, and the resulting ``<name>_highlighted<ext>``
    file is downloaded when that highlighter reports success. Returns None
    in every case; progress and problems are reported with ``print``.
    """
    print("Multi-Format Document Text Highlighter")

    # Step 1: upload through the Colab file picker.
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded. Exiting.")
        return

    # Only the first uploaded file is processed.
    input_file_name = next(iter(uploaded))
    print(f"File '{input_file_name}' uploaded successfully.")

    # Step 2: ask what to look for.
    search_text = input("Enter the specific string of text to be searched and highlighted: ").strip()
    if not search_text:
        print("Search text cannot be empty.")
        return

    # Step 3: choose the highlighter from the file extension.
    base_name, raw_extension = os.path.splitext(input_file_name)
    file_extension = raw_extension.lower()
    output_filename = f"{base_name}_highlighted{file_extension}"

    print(f"\n--- Processing '{input_file_name}' (Type: {file_extension.upper()}) ---")

    handlers = {
        '.pdf': highlight_text_in_pdf,
        '.png': highlight_text_in_image,
        '.jpg': highlight_text_in_image,
        '.jpeg': highlight_text_in_image,
        '.xlsx': highlight_text_in_excel,
        '.docx': highlight_text_in_word,
    }
    handler = handlers.get(file_extension)
    if handler is None:
        print(f"File type '{file_extension}' is not supported.")
        success = False
    else:
        success = handler(input_file_name, search_text, output_filename)

    # Step 4: hand the result back to the browser.
    if not success:
        print("\nProcessing finished, but no output file was generated for download.")
        return

    print("\n--- Download Output ---")
    print(f"Downloading '{output_filename}'...")
    files.download(output_filename)
    print("Download initiated! Check your browser's download folder.")

# Start the interactive flow when this cell is executed.
# NOTE(review): in a notebook kernel __name__ is presumably "__main__", so the
# guard passes here; it only keeps the upload prompt from firing if this code is
# imported as a module instead.
if __name__ == "__main__":
    process_file_in_colab()

Welcome to the Multi-Format Colab Document Text Highlighter! 🚀
----------------------------------------------------------------


Saving Instruction Sheet_AuditRAM.pdf to Instruction Sheet_AuditRAM (5).pdf
File 'Instruction Sheet_AuditRAM (5).pdf' uploaded successfully.
Enter the specific string of text to be searched and highlighted: The goal of this assignment is to create a Python program that takes a file and a text string as input, searches for the text within the file, and then generates an output file/view where the found text is highlighted

--- Processing 'Instruction Sheet_AuditRAM (5).pdf' (Type: .PDF) ---
✅ PDF Success! Found 4 instances. Output saved locally in Colab.

--- Download Output ---
Downloading 'Instruction Sheet_AuditRAM (5)_highlighted.pdf'...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Download initiated! Check your browser's download folder.


# Task
Please manually open and check the `Instruction Sheet_AuditRAM (1)_highlighted.pdf` file that was downloaded to your local machine. Confirm that the text 'auditram' is correctly highlighted within the document. Let me know if you encounter any issues or if the highlighting is not as expected.

## Verify Downloaded PDF

### Subtask:
Manually open and check the `Instruction Sheet_AuditRAM (1)_highlighted.pdf` file on your local machine to ensure the text 'auditram' is correctly highlighted.


## Verify Downloaded PDF

### Subtask:
Manually open and check the `Instruction Sheet_AuditRAM (1)_highlighted.pdf` file on your local machine to ensure the text 'auditram' is correctly highlighted.

#### Instructions
1. Locate the file named `Instruction Sheet_AuditRAM (1)_highlighted.pdf` in your browser's download folder or wherever you saved it.
2. Open the PDF file using your preferred PDF viewer.
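3. Carefully review the document to confirm that every instance of the text 'auditram' is marked with a red rectangle outline (the PDF highlighter draws an unfilled red box around each match rather than a filled highlight).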
4. If you encounter any issues (e.g., the file is not found, cannot be opened, or the highlighting is incorrect or missing), please describe the problem in detail.

## Final Task

### Subtask:
Summarize the verification outcome and offer further assistance if needed.


## Summary:

### Q&A
The agent did not directly confirm if the text 'auditram' is correctly highlighted within the document, as this requires manual inspection of a local file. Instead, the agent provided instructions for the user to perform this verification.

### Data Analysis Key Findings
*   The agent successfully generated detailed, step-by-step instructions for a manual verification process.
*   The instructions clearly outline how to locate, open, and review the `Instruction Sheet_AuditRAM (1)_highlighted.pdf` file, specifically requesting confirmation that all instances of 'auditram' are highlighted in red.
*   The agent considered its programmatic subtask complete after providing these instructions, deferring the actual file content verification to the user due to its inability to access local files.

### Insights or Next Steps
*   The user must now manually follow the provided instructions to verify the PDF file's highlighting.
*   The user should report back the findings of the manual verification, including any discrepancies or issues encountered, to complete the overall task.
