In [2]:
import fitz  # PyMuPDF for PDF processing
from PIL import Image
import easyocr
import io

# Initialize the EasyOCR reader once at module level; extract_text_from_image()
# reuses this single instance for every image instead of building a new one.
# Only English ('en') is requested as a recognition language.
reader = easyocr.Reader(['en'], gpu=True)  # gpu=True requests GPU processing, which is faster if a GPU is available

# Step 1: Extract text and images from the PDF
def extract_text_and_images(pdf_path):
    """Extract the text layer and every embedded image from a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A ``(text_content, images)`` tuple. ``text_content`` is the text of
        each page followed by a newline, concatenated in page order.
        ``images`` is a list of PIL images, in the order PyMuPDF lists them
        page by page.
    """
    page_texts = []
    images = []

    # The context manager closes the document even when extraction fails
    # part-way through; previously the handle was never released.
    with fitz.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf, start=1):
            # Extract text from the page
            page_texts.append(page.get_text() + "\n")

            # Extract images
            for img_index, image in enumerate(page.get_images(full=True), start=1):
                xref = image[0]  # XREF table reference
                base_image = pdf.extract_image(xref)
                # The PIL image reads from its own in-memory copy of the
                # bytes, so it stays usable after the document is closed.
                img = Image.open(io.BytesIO(base_image["image"]))
                images.append(img)
                print(f"Extracted image {img_index} from page {page_number}")

    # Join once instead of growing a string with += on every page.
    return "".join(page_texts), images

# Step 2: Perform OCR on images with EasyOCR
def extract_text_from_image(image):
    """Run EasyOCR on a PIL image and return the recognised text.

    Args:
        image: A PIL image, e.g. one returned by extract_text_and_images().

    Returns:
        The recognised text fragments joined with single spaces; an empty
        string when nothing is recognised.
    """
    # Pillow's PNG encoder cannot write every image mode (CMYK, which is
    # common for JPEGs embedded in PDFs, raises OSError), so convert those
    # to RGB before encoding. Modes PNG can store are left untouched.
    png_writable_modes = ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA")
    if image.mode not in png_writable_modes:
        image = image.convert("RGB")

    # Encode the image as PNG bytes, which is what is handed to EasyOCR.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")

    # detail=0 makes readtext return plain strings only.
    result = reader.readtext(buffer.getvalue(), detail=0)
    return " ".join(result)

# Step 3: Process the PDF and save the extracted content
def process_pdf_with_easyocr(pdf_path, output_txt_path):
    """Extract a PDF's text plus OCR text of its images and save it to a file.

    Args:
        pdf_path: Path of the PDF to process.
        output_txt_path: Path of the UTF-8 text file to write (overwritten).

    The output is the PDF's own text followed by the OCR text of each
    extracted image, every OCR result preceded by a newline.
    """
    embedded_text, extracted_images = extract_text_and_images(pdf_path)

    # One "\n<ocr text>" section per image, in extraction order.
    ocr_sections = [
        "\n" + extract_text_from_image(picture) for picture in extracted_images
    ]
    combined_text = embedded_text + "".join(ocr_sections)

    with open(output_txt_path, "w", encoding="utf-8") as out_file:
        out_file.write(combined_text)

# Usage example
# Raw string: "\B" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning on current Python versions. The raw
# form yields exactly the same path.
pdf_path = r"PDF for Testing\Blockchain and Cybersecurity.pdf"
output_txt_path = "summary_easyocr.txt"
process_pdf_with_easyocr(pdf_path, output_txt_path)
print(f"Extracted text saved to {output_txt_path}")

Using CPU. Note: This module is much faster with a GPU.


Extracted image 1 from page 1
Extracted image 2 from page 1
Extracted image 3 from page 1
Extracted image 4 from page 1
Extracted image 5 from page 1
Extracted image 6 from page 1
Extracted image 7 from page 1
Extracted image 8 from page 1
Extracted image 1 from page 2
Extracted image 2 from page 2
Extracted image 3 from page 2
Extracted image 4 from page 2
Extracted image 5 from page 2
Extracted image 6 from page 2
Extracted image 7 from page 2
Extracted image 1 from page 3
Extracted image 2 from page 3
Extracted image 3 from page 3
Extracted image 4 from page 3
Extracted image 5 from page 3
Extracted image 6 from page 3
Extracted image 7 from page 3
Extracted image 1 from page 4
Extracted image 2 from page 4
Extracted image 3 from page 4
Extracted image 4 from page 4
Extracted image 5 from page 4
Extracted image 6 from page 4
Extracted image 7 from page 4
Extracted image 1 from page 5
Extracted image 2 from page 5
Extracted image 3 from page 5
Extracted image 4 from page 5
Extracted 

KeyboardInterrupt: 