In [1]:
# Install the notebook's dependencies with the explicit %pip magic, which runs
# pip within the current kernel and does not rely on automagic being enabled.
%pip install PyPDF2 pytesseract pillow

Note: you may need to restart the kernel to use updated packages.


In [3]:
import PyPDF2
import pytesseract
from PIL import Image
import io

# Set the path to the Tesseract executable if it's not in your system PATH
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Adjust this path as needed


In [20]:
import os
import PyPDF2
import pytesseract
from PIL import Image, ExifTags
import io

# EXIF tag id of "Orientation", and the counter-clockwise rotation (degrees)
# applied for each of the rotated orientation values handled here.
_EXIF_ORIENTATION_TAG = 0x0112
_EXIF_ROTATIONS = {3: 180, 6: 270, 8: 90}

# Device colour spaces whose raw 8-bit samples can be rebuilt directly:
# PDF colour space name -> (Pillow mode, bytes per pixel).
_RAW_MODES = {
    '/DeviceRGB': ('RGB', 3),
    '/DeviceGray': ('L', 1),
    '/DeviceCMYK': ('CMYK', 4),
}


def _image_extension(image_object):
    """Return the file extension implied by the image stream's final /Filter."""
    # /Filter is optional, so indexing it unconditionally would raise KeyError.
    if '/Filter' not in image_object:
        return 'png'
    image_filter = image_object['/Filter']
    if isinstance(image_filter, list):
        # A filter chain: only the last entry describes the decoded payload.
        image_filter = image_filter[-1] if image_filter else None
    if image_filter == '/DCTDecode':  # JPEG
        return 'jpg'
    if image_filter == '/JPXDecode':  # JPEG2000
        return 'jp2'
    return 'png'  # Default to PNG


def _open_image(image_object):
    """Build a Pillow image from an image XObject, rebuilding raw samples if needed."""
    image_data = image_object.get_data()
    try:
        return Image.open(io.BytesIO(image_data))
    except OSError:
        # Not an encoded image file: fall through and try to interpret the
        # payload as bare pixel samples.
        pass

    has_geometry = '/Width' in image_object and '/Height' in image_object
    bits = image_object['/BitsPerComponent'] if '/BitsPerComponent' in image_object else None
    color_space = image_object['/ColorSpace'] if '/ColorSpace' in image_object else None
    if has_geometry and bits == 8:
        size = (int(image_object['/Width']), int(image_object['/Height']))
        for name, (mode, channels) in _RAW_MODES.items():
            # Only rebuild when the byte count matches exactly, so frombytes
            # cannot fail on a short or padded buffer.
            if color_space == name and len(image_data) == size[0] * size[1] * channels:
                return Image.frombytes(mode, size, image_data)
    raise ValueError("unsupported image encoding")


def _apply_exif_orientation(img):
    """Return img rotated according to its EXIF orientation tag, if it has one."""
    try:
        orientation = img.getexif().get(_EXIF_ORIENTATION_TAG)
    except Exception:
        # Rotation is best-effort: missing or unreadable EXIF data must not
        # stop the image from being saved.
        return img
    degrees = _EXIF_ROTATIONS.get(orientation)
    return img.rotate(degrees, expand=True) if degrees else img


def _export_image(image_object, obj, page_number, output_folder, markdown_content):
    """Save one image XObject into output_folder, OCR it, and log both steps."""
    extension = _image_extension(image_object)
    img = _apply_exif_orientation(_open_image(image_object))
    if extension == 'png' and img.mode == 'CMYK':
        # PNG cannot store CMYK, so convert before saving.
        img = img.convert('RGB')

    # Create a safe filename and ensure no invalid characters
    image_filename = f"page_{page_number}_image_{obj}.{extension}"
    image_filename = ''.join(ch for ch in image_filename if ch.isalnum() or ch in ('-', '_', '.'))
    image_path = os.path.join(output_folder, image_filename)

    img.save(image_path)
    markdown_content.append(f"### Image {obj} saved as {image_filename}\n\n")

    # Perform OCR on the image
    image_text = pytesseract.image_to_string(img)
    markdown_content.append(f"### Image Text from {image_filename}\n\n{image_text}\n\n")


def extract_data_from_pdf(pdf_path, output_folder):
    """Extract page text and embedded images from a PDF into output_folder.

    For every page the extracted text (or a "No text extracted." note) is
    appended to a markdown report. Every image XObject found in the page's
    resources is saved to output_folder, run through Tesseract OCR, and
    logged in the report. The report is written to
    ``<output_folder>/extracted_data.md``.

    A failure on one image (unsupported encoding, save error, OCR error) is
    recorded in the report and extraction continues with the next image.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory for the images and the markdown report;
            created if it does not exist.

    Returns:
        A status string: "Data extracted and saved in <output_folder>"
        (with a note appended when some images could not be processed), or
        "Failed to extract data from PDF: <error>" when the PDF itself could
        not be read, in which case no report is written.
    """
    markdown_content = []
    failed_images = 0

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            for page_number, page in enumerate(reader.pages, start=1):
                # Extract text
                text = page.extract_text()
                if text:
                    markdown_content.append(f"## Page {page_number}\n\n{text}\n\n")
                else:
                    markdown_content.append(f"## Page {page_number}\n\nNo text extracted.\n\n")

                # Extract images; a page may have no /Resources entry at all.
                if '/Resources' not in page:
                    continue
                resources = page['/Resources']
                if '/XObject' not in resources:
                    continue
                x_objects = resources['/XObject'].get_object()

                for obj in x_objects:
                    image_object = x_objects[obj]
                    if not ('/Subtype' in image_object and image_object['/Subtype'] == '/Image'):
                        continue
                    try:
                        _export_image(image_object, obj, page_number, output_folder, markdown_content)
                    except Exception as image_error:
                        # Keep going: one bad image should not discard the
                        # text and images already extracted.
                        failed_images += 1
                        markdown_content.append(
                            f"### Image {obj} on page {page_number} could not be processed: {image_error}\n\n"
                        )

    except Exception as e:
        return f"Failed to extract data from PDF: {e}"

    # Save markdown content to a file
    markdown_file = os.path.join(output_folder, 'extracted_data.md')
    with open(markdown_file, 'w', encoding='utf-8') as md_file:
        md_file.writelines(markdown_content)

    message = f"Data extracted and saved in {output_folder}"
    if failed_images:
        message += f" ({failed_images} image(s) could not be processed; see extracted_data.md)"
    return message

# Example run: extract the sample PDF's text and images into a local folder
# and show the status message returned by the extractor.
pdf_path = './pdf_extract_base.pdf'
output_folder = './extracted_data'
result = extract_data_from_pdf(pdf_path=pdf_path, output_folder=output_folder)
print(result)


Data extracted and saved in ./extracted_data
