In [1]:
import os
import fitz  # PyMuPDF
import io
import base64
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from PIL import Image
from dotenv import load_dotenv
import time
import random
import google.generativeai as genai

# Bring the variables defined in the .env file into the process environment.
load_dotenv()

# The Gemini SDK cannot be configured without a key, so fail fast when it is absent or empty.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
if gemini_api_key is None or gemini_api_key == "":
    raise EnvironmentError("GEMINI_API_KEY is not set. Please add it to your .env file.")

# Register the key with the SDK, then build the model handle shared by the transcription code.
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

def pdf_to_base64_images(pdf_path):
    """
    Converts a PDF into Base64-encoded PNG images, one per page.

    Pages are rendered one after another on the calling thread: PyMuPDF
    documents are not safe to use from several threads at once, so rendering
    pages of the same document in a thread pool risks crashes or corrupted
    output.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        A list of Base64 strings (PNG data) in page order. A page that fails
        to render is reported on stdout and left out of the list. An empty
        list is returned when the document cannot be opened.
    """
    try:
        pdf_document = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF {pdf_path}: {e}")
        return []

    def process_page(page):
        # Render a single page to PNG and return it Base64-encoded (None on failure).
        try:
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            buffered = io.BytesIO()
            img.save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode()
        except Exception as e:
            print(f"Error processing page {page.number + 1}: {e}")
            return None

    try:
        base64_images = [process_page(page) for page in pdf_document]
    finally:
        # Release the file handle even if loading a page raises unexpectedly.
        pdf_document.close()

    return [img for img in base64_images if img is not None]

def process_summary_from_image(base64_str, max_retries=3, retry_delay=5):
    """
    Uses Gemini to transcribe text from a Base64-encoded image.

    Args:
        base64_str: Base64-encoded PNG image data.
        max_retries: Maximum number of requests to attempt before giving up.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The transcription text returned by the model, or None when the input
        is not valid Base64 or every attempt failed.
    """
    text_prompt = """
    You will be given an image containing text. Your task is to accurately transcribe all the text from this image. 
    Pay special attention to names, tables and numbers.

    Follow these steps to complete the task:
    1. Carefully examine the entire image.
    2. Transcribe all visible text exactly as it appears in the image.
    3. If any text is unclear or illegible, do not attempt to guess or fill in information. Instead, indicate unclear text with [unclear] in your transcription.
    4. Pay particular attention to visual elements such as tables, charts, and diagrams. Ensure these are transcribed accurately and in a clear, organized manner.
    5. If the order of information in the image is not clear, think step by step about the logical flow of the content. Arrange the transcribed information in a relevant and coherent order.
    6. Do not add any information that is not present in the image.
    7. Do not include any preamble or explanation about the transcription process in your response.
    8. For Visual Elements:
        a. For tables: Transcribe headers, rows, and columns in a markdown table format, ensuring proper alignment and structure.
        b. For charts or diagrams: Provide a detailed description of the type (e.g., bar chart, flowchart), layout, and any labeled data points.
        Example Markdown Table:
        | Column 1 Header | Column 2 Header | Column 3 Header |
        |---------------- |-----------------|-----------------|
        | Row 1, Cell 1   | Row 1, Cell 2   | Row 1, Cell 3   |
        | Row 2, Cell 1   | Row 2, Cell 2   | Row 2, Cell 3   |
        | Row 3, Cell 1   | Row 3, Cell 2   | Row 3, Cell 3   |
    9. Your response should only contain the transcribed content from the image, organized in a logical manner if necessary.
    10. If you encounter any issues or if the image is not clear enough to transcribe, explain the problem instead of providing a transcription.
    """

    # Decode once up front: malformed input can never succeed, so retrying it
    # would only waste time sleeping between attempts.
    try:
        image_bytes = base64.b64decode(base64_str)
    except (ValueError, TypeError) as e:
        print(f"Invalid Base64 image data: {e}")
        return None

    for attempt in range(1, max_retries + 1):
        try:
            response = model.generate_content([
                text_prompt,  # Include a text prompt as required by the API
                {
                    "mime_type": "image/png",
                    "data": image_bytes
                }
            ])
            # Reading .text inside the try keeps a failure here on the retry path.
            text = response.text
        except Exception as e:
            if attempt < max_retries:
                print(f"APIError: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                # No further attempt follows, so do not sleep or claim a retry.
                print(f"APIError: {e}.")
        else:
            time.sleep(random.uniform(0.5, 1.5))  # Introduce a slight delay to avoid rate limits
            return text

    print("Max retries reached. Unable to process the image.")
    return None

def split_image(image, max_height=1024):
    """
    Splits a large image into smaller parts to avoid API issues.

    The image is cut into full-width horizontal strips from top to bottom.
    Every strip is max_height pixels tall except possibly the last one, which
    holds whatever rows remain.

    Args:
        image: Image object exposing size, crop(box) and save(fp, format=...).
        max_height: Maximum height in pixels of each strip (default 1024).

    Returns:
        A list of Base64-encoded PNG strings, one per strip, in top-to-bottom
        order.

    Raises:
        ValueError: If max_height is not a positive number of pixels.
    """
    if max_height <= 0:
        raise ValueError("max_height must be a positive number of pixels")

    width, height = image.size
    parts = []

    for top in range(0, height, max_height):
        # Clamp the bottom edge so the final strip stops at the image border.
        box = (0, top, width, min(top + max_height, height))
        buffered = io.BytesIO()
        image.crop(box).save(buffered, format="PNG")
        parts.append(base64.b64encode(buffered.getvalue()).decode())

    return parts

def process_pdf_to_text(pdf_path):
    """
    Complete pipeline for processing a PDF file:
    1. Convert PDF pages to images.
    2. Split images if needed.
    3. Transcribe images to text using Gemini (parallel processing).

    Args:
        pdf_path: Path of the PDF file to transcribe.

    Returns:
        A list of transcription strings with one entry per image returned by
        pdf_to_base64_images, in the same order. A page for which nothing
        could be transcribed is represented by an empty string so that the
        position of every other page stays correct. An empty list is returned
        when there are no images or when no page produced any text.
    """
    print(f"Processing PDF: {pdf_path}")

    base64_images = pdf_to_base64_images(pdf_path)
    print(f"Converted {len(base64_images)} pages to Base64 images.")

    if not base64_images:
        print(f"No images to process for PDF {pdf_path}. Skipping transcription.")
        return []

    def transcribe(index, base64_image):
        # Transcribe one page, splitting it into strips when it is taller than 1024 px.
        try:
            decoded_image = Image.open(io.BytesIO(base64.b64decode(base64_image)))
            if decoded_image.size[1] > 1024:
                image_parts = split_image(decoded_image)
            else:
                image_parts = [base64_image]

            part_texts = [process_summary_from_image(part) for part in image_parts]
            # Parts that could not be transcribed come back falsy and are skipped.
            return index, "\n".join(text for text in part_texts if text)
        except Exception as e:
            print(f"Error processing page {index + 1}: {e}")
            return index, None

    # One slot per page: a failed page keeps its (empty) slot instead of
    # shifting every later page to a lower position.
    page_texts = [""] * len(base64_images)

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(transcribe, i, img) for i, img in enumerate(base64_images)]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Transcribing pages"):
            index, transcription = future.result()
            if transcription:
                page_texts[index] = transcription
            else:
                print(f"No transcription produced for page {index + 1}.")

    # Keep the "nothing transcribed" result falsy so callers can still test for emptiness.
    if not any(page_texts):
        return []
    return page_texts

if __name__ == "__main__":
    # Transcribe every PDF in pdf_directory and write one text file per PDF
    # into output_directory.
    pdf_directory = "pdfs1"
    output_directory = "output"
    os.makedirs(output_directory, exist_ok=True)

    # Match the extension case-insensitively so files named "*.PDF" are not
    # silently skipped, and sort because os.listdir returns entries in
    # arbitrary order.
    pdf_files = sorted(f for f in os.listdir(pdf_directory) if f.lower().endswith(".pdf"))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        transcriptions = process_pdf_to_text(pdf_path)

        if not transcriptions:
            print(f"No transcriptions generated for {pdf_file}. Skipping saving.")
            continue

        # Name the output after the PDF, without its extension.
        output_file = os.path.join(output_directory, f"{os.path.splitext(pdf_file)[0]}_transcription.txt")
        with open(output_file, "w", encoding="utf-8") as f:
            for i, transcription in enumerate(transcriptions, start=1):
                f.write(f"### Page {i}\n{transcription}\n\n")

        print(f"Transcriptions saved to {output_file}")


  from .autonotebook import tqdm as notebook_tqdm


Processing PDF: pdfs1\3M_2015_10K.pdf
Converted 158 pages to Base64 images.


Transcribing pages:  31%|███       | 49/158 [01:18<02:30,  1.38s/it]

APIError: 500 Internal error encountered.. Retrying in 5 seconds...
APIError: 500 Internal error encountered.. Retrying in 5 seconds...


Transcribing pages:  33%|███▎      | 52/158 [01:24<02:36,  1.48s/it]

APIError: 500 Internal error encountered.. Retrying in 5 seconds...


Transcribing pages:  75%|███████▌  | 119/158 [03:23<01:01,  1.57s/it]

APIError: 500 Internal error encountered.. Retrying in 5 seconds...


Transcribing pages: 100%|██████████| 158/158 [04:19<00:00,  1.65s/it]


Transcriptions saved to output\3M_2015_10K_transcription.txt
Processing PDF: pdfs1\3M_2016_10K.pdf
Converted 233 pages to Base64 images.


Transcribing pages:  15%|█▍        | 34/233 [00:58<03:58,  1.20s/it]

APIError: 500 Internal error encountered.. Retrying in 5 seconds...


Transcribing pages:  62%|██████▏   | 144/233 [03:58<01:40,  1.13s/it]

APIError: 500 Internal error encountered.. Retrying in 5 seconds...


Transcribing pages: 100%|██████████| 233/233 [05:40<00:00,  1.46s/it]

Transcriptions saved to output\3M_2016_10K_transcription.txt



