In [1]:
import fitz  # PyMuPDF
import os
import base64
from PIL import Image
import io
from dotenv import load_dotenv
import time  # Import time for potential rate limiting pauses

# LangChain imports
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI

# --- Configuration ---
# Pull settings from a local .env file (if any) before reading the API key.
load_dotenv()

# The Gemini API key is read from the GOOGLE_API_KEY environment variable.
# Keys are issued by Google AI Studio: https://aistudio.google.com/app/apikey
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Fail fast: nothing below can run without a key.
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY not found in environment variables. Please set it in your .env file.")

# <<< SET YOUR PDF FILE PATH AND PAGE RANGE HERE >>>
# Input PDF to read.
pdf_file = "CourseBook_Semester2_AlTafsir.pdf"
# Destination for the extracted text (Markdown format recommended).
output_file = "output_ocr_CourseBook_Semester2_AlTafsir.md"
# First and last page to process; both are 1-based and inclusive.
# An end_page of 0 means "continue through the last page of the document".
start_page, end_page = 20, 30

# --- Helper Functions ---

def image_to_base64(image: Image.Image, format="JPEG") -> str:
    """Encode a PIL image in ``format`` and return it as a base64 string.

    Images carrying an alpha channel ('RGBA', 'LA', 'PA') are flattened onto
    a white background, palette images are expanded to RGB, and - when the
    target format is JPEG - any remaining mode the JPEG writer does not
    accept is converted to RGB so that saving does not fail.

    Args:
        image: The image to encode. It is not modified; conversions work on
            a copy.
        format: Pillow format name passed to ``Image.save`` (default "JPEG").

    Returns:
        The encoded image bytes as a base64 string.
    """
    # Modes accepted by Pillow's JPEG writer; anything else must be converted.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    # Bring the other alpha-carrying modes to RGBA so one code path
    # handles every kind of transparency.
    if image.mode in ('LA', 'PA'):
        image = image.convert('RGBA')

    if image.mode == 'RGBA':
        # Composite onto white, using the image's own alpha as the mask.
        bg = Image.new('RGB', image.size, (255, 255, 255))
        bg.paste(image, (0, 0), image)
        image = bg
    elif image.mode == 'P':
        # Palette images are expanded to plain RGB.
        image = image.convert('RGB')
    elif format.upper() == 'JPEG' and image.mode not in jpeg_modes:
        # e.g. 'I', 'F', 'I;16': saving these as JPEG would raise.
        image = image.convert('RGB')

    buffered = io.BytesIO()
    image.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

def get_ocr_text_from_image(image_base64: str, api_key: str) -> str:
    """
    Sends an image to the Gemini model and asks specifically for OCR text extraction.

    Args:
        image_base64: Base64-encoded image data; it is sent with a JPEG data-URL prefix.
        api_key: Google API key used to create the chat model client.

    Returns:
        The text returned by the model, or one of these marker strings:
        "[No text found in image]" when the model reports no text,
        "[LLM Initialization Failed]" when the client cannot be created, or
        "[Image OCR Failed: <error>]" when the API call raises.
    """
    # Initialize the LLM (consider adding retry logic for production)
    try:
        llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=api_key)
    except Exception as e:
        print(f"Error initializing Google AI Client: {e}")
        return "[LLM Initialization Failed]"

    # *** Enhanced OCR-Focused Prompt with Hierarchy Preservation ***
    prompt = (
        "Perform detailed OCR on this image while carefully preserving the document's structure. "
        "Please analyze: "
        "1. Text hierarchy - identify main titles (usually larger or colored text like red/blue), "
        "2. Subheadings (medium-sized or distinctively colored text), "
        "3. Body text (standard paragraphs), "
        "4. Numbered or bulleted lists, "
        "5. Special formatting (colored text, highlighted sections, quotes in different colors like green), "
        "6. Visual indicators of importance (boxes, background colors, icons). "
        "Format the output in markdown with appropriate heading levels (# for main titles, ## for subtitles, etc.). "
        "Preserve all numbering, bullet points, and paragraph structures exactly as they appear. "
        "If quoted text appears in a distinct color (like green in religious texts), preserve it with appropriate markdown. "
        "Extract ALL text exactly as written. "
        "If there is no text, respond with only '[No text found in image]'. "
        "Do not describe the image content - only return properly formatted extracted text with its hierarchy maintained."
    )
    message = HumanMessage(
        content=[
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": f"data:image/jpeg;base64,{image_base64}"  # Assumes JPEG format after conversion
            },
        ]
    )
    try:
        # Add a small delay to potentially avoid rate limits
        time.sleep(1.5)
        response = llm.invoke([message])

        # Message content may be a plain string or a list of parts (strings
        # or dicts). Normalize to one string so the checks below cannot fail
        # on a list and turn a successful call into an "OCR Failed" result.
        content = response.content
        if not isinstance(content, str):
            content = "".join(
                part if isinstance(part, str) else str(part.get("text", ""))
                for part in content
                if isinstance(part, (str, dict))
            )

        # Basic check if the model indicates no text was found
        if '[no text found in image]' in content.lower():
            return "[No text found in image]"
        return content
    except Exception as e:
        # Handle potential API errors (rate limits, etc.)
        print(f"Error calling Gemini API for OCR: {e}")
        # Consider more specific error handling (e.g., retries for rate limits)
        return f"[Image OCR Failed: {e}]"  # Include error message for debugging

def process_pdf_page_with_ocr(page: fitz.Page, llm_api_key: str) -> list[str]:
    """
    Processes a single PDF page using OCR only.

    Every image referenced by the page is extracted, converted to JPEG and
    sent to the OCR model. A failure on one image is reported and skipped so
    the remaining images on the page are still processed.

    Args:
        page: The PDF page whose embedded images are OCR'd.
        llm_api_key: API key forwarded to get_ocr_text_from_image.

    Returns:
        The stripped, non-empty OCR texts, in the order the images are listed
        by the page. Results containing "[No text found in image]" are dropped.
    """
    ocr_results = []

    # Extract Images and Perform OCR
    image_list = page.get_images(full=True)
    print(f"Page {page.number + 1}: Found {len(image_list)} images.")

    for img_index, img_info in enumerate(image_list):
        xref = img_info[0]
        try:
            # Extraction is inside the try so that a broken xref only skips
            # this image instead of aborting the rest of the page.
            base_image = page.parent.extract_image(xref)
            if not base_image:
                print(f"  - Skipping image {img_index + 1} (extraction failed for xref {xref})")
                continue

            image_bytes = base_image.get("image")
            if not image_bytes:
                print(f"  - Skipping image {img_index + 1} (no image data found for xref {xref})")
                continue

            # Load image with PIL and convert to a format Gemini likes
            # (JPEG preferred, handle transparency/palette). The context
            # manager releases the decoder as soon as encoding is done.
            with Image.open(io.BytesIO(image_bytes)) as pil_image:
                img_base64 = image_to_base64(pil_image, format="JPEG")

            # Get OCR Text from Image
            print(f"  - Performing OCR on image {img_index + 1} (xref {xref})...")
            ocr_text = get_ocr_text_from_image(img_base64, llm_api_key)
            print(f"  - OCR Result received for image {img_index + 1}.")

            cleaned = ocr_text.strip()
            if cleaned and "[No text found in image]" not in cleaned:
                ocr_results.append(cleaned)

        except Exception as e:
            print(f"  - Error processing image {img_index + 1} (xref {xref}): {e}")

    return ocr_results

# --- Main Workflow ---

def pdf_to_ocr_knowledge_base(pdf_path: str, output_path: str, start_pg: int, end_pg: int, api_key: str):
    """
    Main workflow: Extracts text using OCR only and creates a clean knowledge base output.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: File the collected text is written to (UTF-8).
        start_pg: First page to process (1-based, inclusive). Values below 1
            (or None) start at the first page.
        end_pg: Last page to process (1-based, inclusive). 0, a negative
            value or None means "through the last page"; values past the end
            of the document are clamped to the last page.
        api_key: Google API key passed on to the per-page OCR step.

    Returns:
        None. Problems (missing file, missing key, bad page range, per-page
        errors, write errors) are reported with print() rather than raised.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        return
    if not api_key:
        print("Error: GOOGLE_API_KEY is missing.")
        return

    print(f"Processing PDF: {pdf_path}")

    # Collect the output pieces in a list and join once at the end instead of
    # growing one string page by page. Starts with the knowledge base header.
    sections = [f"# Knowledge Base - {os.path.basename(pdf_path)}\n\n"]

    # The context manager closes the document on every exit path, including
    # the early return below and any exception that escapes the loop.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)

        # Validate Page Range (convert the 1-based inclusive range to 0-based indices)
        actual_start_index = max(0, (start_pg or 1) - 1)
        last_page = end_pg if end_pg and end_pg > 0 else total_pages
        actual_end_index = min(last_page, total_pages) - 1

        if actual_start_index > actual_end_index:
            print(f"Error: Start page ({start_pg}) > End page ({end_pg}). Please check your page range.")
            return

        print(f"Extracting text using OCR (Pages {actual_start_index + 1}-{actual_end_index + 1})")

        # --- Process Specified Page Range ---
        for page_num in range(actual_start_index, actual_end_index + 1):
            current_page_num = page_num + 1
            print(f"\n--- Processing Page {current_page_num} ---")
            try:
                page = doc.load_page(page_num)
                ocr_results = process_pdf_page_with_ocr(page, api_key)
            except Exception as e:
                # A failing page is reported and skipped; later pages still run.
                print(f"An error occurred while processing page {current_page_num}: {e}")
                continue

            # Add OCR content if found
            if ocr_results:
                sections.extend(f"{ocr_text}\n\n" for ocr_text in ocr_results)
            else:
                print(f"  - No text content found on page {current_page_num}")

    # --- Save Output ---
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("".join(sections))
        print(f"\nSuccessfully processed PDF pages {actual_start_index + 1}-{actual_end_index + 1}.")
        print(f"OCR knowledge base saved to: {output_path}")
    except IOError as e:
        print(f"Error writing output file {output_path}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during file writing: {e}")


# --- Execution ---
if __name__ == "__main__":
    # Only start the pipeline when an API key was loaded; otherwise say why nothing ran.
    if not GOOGLE_API_KEY:
        print("Processing stopped due to missing Google API Key.")
    else:
        pdf_to_ocr_knowledge_base(pdf_file, output_file, start_page, end_page, GOOGLE_API_KEY)

  from .autonotebook import tqdm as notebook_tqdm


Processing PDF: CourseBook_Semester2_AlTafsir.pdf
Extracting text using OCR (Pages 20-30)

--- Processing Page 20 ---
Page 20: Found 1 images.
  - Performing OCR on image 1 (xref 174)...
  - OCR Result received for image 1.

--- Processing Page 21 ---
Page 21: Found 1 images.
  - Performing OCR on image 1 (xref 177)...
  - OCR Result received for image 1.

--- Processing Page 21 ---
Page 21: Found 1 images.
  - Performing OCR on image 1 (xref 177)...
  - OCR Result received for image 1.

--- Processing Page 22 ---
Page 22: Found 1 images.
  - Performing OCR on image 1 (xref 182)...
  - OCR Result received for image 1.

--- Processing Page 22 ---
Page 22: Found 1 images.
  - Performing OCR on image 1 (xref 182)...
  - OCR Result received for image 1.

--- Processing Page 23 ---
Page 23: Found 1 images.
  - Performing OCR on image 1 (xref 185)...
  - OCR Result received for image 1.

--- Processing Page 23 ---
Page 23: Found 1 images.
  - Performing OCR on image 1 (xref 185)...
  - OCR R