In [1]:
import os
import pdfplumber
import fitz  # PyMuPDF
import pandas as pd

# Input folder holding the source PDFs, and one output folder per artifact type
pdf_folder = 'data/'
text_output_folder = 'output/text/'
images_output_folder = 'output/images/'
tables_output_folder = 'output/tables/'

# Create each output folder up front; an already-existing folder is not an error
for output_folder in (text_output_folder, images_output_folder, tables_output_folder):
    os.makedirs(output_folder, exist_ok=True)

# Function to extract text, images, and tables from a PDF
def process_pdf(pdf_path, pdf_name):
    """Extract the text, tables, and embedded images of one PDF to disk.

    Output goes to the module-level folders ``text_output_folder``,
    ``tables_output_folder`` and ``images_output_folder``:

    * ``<pdf_name>.txt`` - text of every page, pages separated by a newline.
    * ``<pdf_name>_table_<page>_<i>.csv`` - one CSV per table pdfplumber finds;
      ``<page>`` is the 1-based page number, ``<i>`` the 0-based index of the
      table on that page. The first row of a table is used as the CSV header.
    * ``<pdf_name>_page_<page>_img_<k>.<ext>`` - one file per embedded image;
      ``<page>`` and ``<k>`` are 1-based, ``<ext>`` is the image format
      reported by PyMuPDF for the extracted bytes.

    Args:
        pdf_path: Path of the PDF file to read.
        pdf_name: Prefix used for every output file name (typically the PDF's
            file name without its extension).
    """
    # --- Text and tables (pdfplumber) ---
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for a page with no text layer.
            page_texts.append(page.extract_text() or "")

            for table_index, table in enumerate(page.extract_tables()):
                if not table:
                    # No rows at all, so there is no header row to build from.
                    continue
                header, *rows = table
                df = pd.DataFrame(rows, columns=header)
                # page.page_number is already 1-based; adding 1 would mislabel
                # the file and disagree with the image file numbering below.
                csv_name = f"{pdf_name}_table_{page.page_number}_{table_index}.csv"
                df.to_csv(os.path.join(tables_output_folder, csv_name), index=False)

    # Join with a newline so the last word of one page is not glued to the
    # first word of the next.
    text_path = os.path.join(text_output_folder, f"{pdf_name}.txt")
    with open(text_path, 'w', encoding='utf-8') as text_file:
        text_file.write("\n".join(page_texts))

    # --- Embedded images (PyMuPDF) ---
    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first item of each image entry is its xref
                base_image = pdf_document.extract_image(xref)
                if not base_image:
                    # NOTE(review): some PyMuPDF versions return an empty dict
                    # when the xref is not an extractable image - skip those.
                    continue
                # The bytes are in the image's own format (often JPEG), so use
                # the reported extension rather than always writing ".png".
                image_ext = base_image["ext"]
                image_filename = os.path.join(
                    images_output_folder,
                    f"{pdf_name}_page_{page_num + 1}_img_{img_index}.{image_ext}",
                )
                with open(image_filename, 'wb') as img_file:
                    img_file.write(base_image["image"])
    finally:
        # Release the document even if extraction fails part-way through.
        pdf_document.close()

# Process every PDF in the input folder.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting makes
#   the run order reproducible.
# - lower(): match the extension case-insensitively so files named "X.PDF"
#   are not silently skipped.
for pdf_filename in sorted(os.listdir(pdf_folder)):
    if pdf_filename.lower().endswith('.pdf'):
        process_pdf(
            os.path.join(pdf_folder, pdf_filename),
            os.path.splitext(pdf_filename)[0],  # output prefix: name without extension
        )


In [1]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os

# If the Tesseract executable is not on PATH, point pytesseract at it explicitly:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Input PDF and output location
pdf_path = "data/anatomy_vol_1.pdf"
text_output_folder = 'output/text/'
os.makedirs(text_output_folder, exist_ok=True)

# Output file prefix: the PDF's file name without directory or extension
pdf_basename = os.path.basename(pdf_path)
pdf_name = os.path.splitext(pdf_basename)[0]

# Render the PDF's pages as images
images = convert_from_path(pdf_path)

# OCR each page image; every page's text is followed by a newline
page_texts = [pytesseract.image_to_string(image) for image in images]
text = "".join(page_text + "\n" for page_text in page_texts)

# Write the combined OCR text next to the other text outputs
output_path = os.path.join(text_output_folder, f"{pdf_name}.txt")
with open(output_path, 'w', encoding='utf-8') as text_file:
    text_file.write(text)

print(f"Text extracted and saved to {pdf_name}.txt")


Text extracted and saved to anatomy_vol_1.txt


In [2]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os

# If the Tesseract executable is not on PATH, point pytesseract at it explicitly:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Input PDF and output location
pdf_path = "data/anatomy_vol_2.pdf"
text_output_folder = 'output/text/'
os.makedirs(text_output_folder, exist_ok=True)

# Output file prefix: the PDF's file name without directory or extension
pdf_basename = os.path.basename(pdf_path)
pdf_name = os.path.splitext(pdf_basename)[0]

# Render the PDF's pages as images
images = convert_from_path(pdf_path)

# OCR each page image; every page's text is followed by a newline
page_texts = [pytesseract.image_to_string(image) for image in images]
text = "".join(page_text + "\n" for page_text in page_texts)

# Write the combined OCR text next to the other text outputs
output_path = os.path.join(text_output_folder, f"{pdf_name}.txt")
with open(output_path, 'w', encoding='utf-8') as text_file:
    text_file.write(text)

print(f"Text extracted and saved to {pdf_name}.txt")


Text extracted and saved to anatomy_vol_2.txt
