In [None]:
# %% [markdown]
# # PDF Figure and Table Extractor
# This notebook extracts all Figures and Tables from a PDF and saves them as PNG images.
# - Figures are saved in a `Figures/` folder
# - Tables are saved in a `Tables/` folder

# %% Cell 1: Install Required Libraries
# Run this cell to install all necessary dependencies

import subprocess
import sys

def install_packages():
    """Install the third-party packages this notebook depends on.

    Each package gets its own quiet ``pip install`` run through the
    interpreter that is executing this notebook. ``subprocess.check_call``
    raises ``CalledProcessError`` if any single install fails, in which case
    the success message is not printed.
    """
    required = (
        'pdfplumber',
        'pdf2image',
        'pymupdf',  # This is fitz
        'Pillow',
        'regex',
    )
    pip_install = [sys.executable, '-m', 'pip', 'install']
    pip_flags = ['-q', '--break-system-packages']

    for name in required:
        subprocess.check_call(pip_install + [name] + pip_flags)

    print("‚úÖ All packages installed successfully!")

# Runs the installs as soon as this cell executes (one pip call per package).
install_packages()

‚úÖ All packages installed successfully!


In [None]:
# %% Cell 2: Import Libraries

import os
import re
import fitz  # PyMuPDF
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image
from pathlib import Path

print("‚úÖ All libraries imported successfully!")

‚úÖ All libraries imported successfully!


In [None]:
# %% Cell 3: Configuration
# Set your PDF file path here

# === CONFIGURE THIS ===
PDF_PATH = "riscv-spec-20191213.pdf"  # Change this to your PDF file path
OUTPUT_DIR = "output"   # Base output directory
# ======================

# Create output directories
# FIGURES_DIR and TABLES_DIR are module-level names: main(), preview_images()
# and cleanup_output() read them as globals rather than taking them as arguments.
FIGURES_DIR = os.path.join(OUTPUT_DIR, "Figures")
TABLES_DIR = os.path.join(OUTPUT_DIR, "Tables")

# exist_ok=True makes this cell safe to re-run when the folders already exist.
os.makedirs(FIGURES_DIR, exist_ok=True)
os.makedirs(TABLES_DIR, exist_ok=True)

print(f"üìÅ Output directories created:")
print(f"   - Figures: {FIGURES_DIR}")
print(f"   - Tables: {TABLES_DIR}")

üìÅ Output directories created:
   - Figures: output/Figures
   - Tables: output/Tables


In [None]:
# %% Cell 4: Helper Functions for Finding Figures and Tables

def find_figure_table_captions(pdf_path):
    """
    Find all Figure and Table captions with their page numbers and positions.

    Every page of the PDF is scanned with pdfplumber. Each textual match of a
    caption label ("Figure 1.1", "Fig. 2", "Table 3-4", ...) produces one
    entry, so in-text references to a figure or table are reported as well as
    the caption itself.

    Parameters:
    - pdf_path: Path to the PDF file

    Returns a list of dictionaries with the keys 'type' ('figure' or 'table'),
    'full_caption' (matched label text), 'number' (numeric part of the label),
    'page' (0-based page index), 'position' (bounding box returned by
    find_text_position, or None), 'page_height' and 'page_width'. Within each
    page the figure matches come first, followed by the table matches.
    """
    # The leading \b keeps the label from matching inside a longer word:
    # without it "config 3" is read as "fig 3" and "vegetable 2" as "table 2"
    # (matching is case-insensitive).

    # Matches: Figure 1, Figure 1.1, Figure 1.1.1, Figure 1-1, Fig. 1, Fig 1.1, etc.
    figure_pattern = re.compile(
        r'\b(Fig(?:ure)?\.?\s*(\d+(?:[.\-]\d+)*))',
        re.IGNORECASE
    )

    # Matches: Table 1, Table 1.1, Table 1.1.1, Table 1-1, etc.
    table_pattern = re.compile(
        r'\b(Table\s*(\d+(?:[.\-]\d+)*))',
        re.IGNORECASE
    )

    # Scan order per page: figures first, then tables.
    caption_patterns = (
        ('figure', figure_pattern),
        ('table', table_pattern),
    )

    captions = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # Words carry the coordinates; the plain text is what the regexes scan.
            words = page.extract_words()
            text = page.extract_text() or ""

            for caption_type, pattern in caption_patterns:
                for match in pattern.finditer(text):
                    full_match = match.group(1)
                    number = match.group(2)

                    captions.append({
                        'type': caption_type,
                        'full_caption': full_match,
                        'number': number,
                        'page': page_num,
                        # Bounding box of the label on the page (None if not located)
                        'position': find_text_position(words, full_match),
                        'page_height': page.height,
                        'page_width': page.width
                    })

    return captions


def find_text_position(words, search_text):
    """
    Find the bounding box position of text in the page.

    Parameters:
    - words: List of word dictionaries with 'text', 'x0', 'x1', 'top' and
      'bottom' keys (the shape this notebook gets from page.extract_words())
    - search_text: Text to locate; it is lower-cased and split on whitespace

    Returns (x0, y0, x1, y1) for the first run of consecutive words that
    contains every search word in order, or None if there is no such run.
    x0/y0 come from the first word of the run; x1/y1 are the largest right
    and bottom edges in the run. A search word counts as found when it is a
    substring of the page word, so "1.1" still matches the token "1.1:".
    """
    search_words = search_text.lower().split()
    if not search_words or not words:
        return None

    span = len(search_words)

    # Only start positions that leave room for ALL search words are tried.
    # Previously a run cut short by the end of the word list was accepted as
    # a match, returning the box of a partial hit (e.g. a trailing "Table").
    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if all(sw in w['text'].lower() for sw, w in zip(search_words, window)):
            first = window[0]
            return (
                first['x0'],
                first['top'],
                max(w['x1'] for w in window),
                max(w['bottom'] for w in window),
            )

    return None

print("‚úÖ Helper functions defined!")

‚úÖ Helper functions defined!


In [None]:
# %% Cell 5: Functions to Extract Figure/Table Regions

def estimate_figure_region(caption_info, all_captions, page_height, page_width, margin=20):
    """
    Guess the page region occupied by a figure or table from its caption.

    The region always spans the full page width. Its top edge is the top of
    the closest caption above this one on the same page (0 if there is none).
    Its bottom edge depends on the item type:

    - figure: the bottom of its own caption plus ``margin``
    - table: ``margin`` above the next caption on the page, or the page
      bottom when no caption follows

    Both bottoms are capped at ``page_height``. When the caption has no known
    position the whole page is returned.

    Returns (x0, top, x1, bottom) in the same units as the caption positions.
    """
    position = caption_info['position']
    page_num = caption_info['page']

    # Without a caption position there is nothing to anchor on: take the page.
    if position is None:
        return (0, 0, page_width, page_height)

    own_top = position[1]

    # Top edges of every located caption that shares this page.
    same_page_tops = [
        other['position'][1]
        for other in all_captions
        if other['page'] == page_num and other['position'] is not None
    ]

    # Nearest caption above (falls back to 0) and below (falls back to page bottom).
    upper_limit = max([0] + [y for y in same_page_tops if y < own_top])
    lower_limit = min([page_height] + [y for y in same_page_tops if y > own_top])

    if caption_info['type'] == 'figure':
        # A figure sits above its caption: stop just below the caption line.
        bottom = min(page_height, position[3] + margin)
    elif lower_limit < page_height:
        # A table may extend below its caption: stop short of the next caption.
        bottom = min(page_height, lower_limit - margin)
    else:
        bottom = page_height

    return (0, max(0, upper_limit), page_width, bottom)


def get_figures_and_tables_regions(pdf_path):
    """
    Locate every figure/table caption in the PDF and attach a crop region.

    Captions come from find_figure_table_captions(); only the first caption
    for each (type, number, page) combination is kept. Each surviving caption
    dictionary is copied and extended with a 'region' entry computed by
    estimate_figure_region() against the de-duplicated caption list.

    Returns the list of extended caption dictionaries in discovery order.
    """
    # setdefault keeps the first caption seen for a key; dicts keep insertion order.
    first_by_key = {}
    for caption in find_figure_table_captions(pdf_path):
        identity = (caption['type'], caption['number'], caption['page'])
        first_by_key.setdefault(identity, caption)

    unique_captions = list(first_by_key.values())

    return [
        {
            **caption,
            'region': estimate_figure_region(
                caption,
                unique_captions,
                caption['page_height'],
                caption['page_width']
            ),
        }
        for caption in unique_captions
    ]

print("‚úÖ Region extraction functions defined!")

‚úÖ Region extraction functions defined!


In [None]:
# %% Cell 6: Function to Extract and Save Images

def extract_and_save_images(pdf_path, output_figures_dir, output_tables_dir, dpi=200):
    """
    Extract all figures and tables from PDF and save as PNG.

    All pages are rasterised with pdf2image; each region returned by
    get_figures_and_tables_regions() is scaled from PDF coordinates to pixels,
    padded by 10 px and cropped out of its page image. A region that scales
    to an empty box falls back to the full page.

    Parameters:
    - pdf_path: Path to the PDF file
    - output_figures_dir: Directory that receives the figure PNGs
    - output_tables_dir: Directory that receives the table PNGs
    - dpi: Resolution used to rasterise the pages

    Returns a list of dictionaries ('type', 'number', 'filename', 'path',
    1-based 'page'), one per saved image, or [] when nothing was detected.

    File naming: the first item with a given number is saved as
    ``figure_<number>.png`` / ``table_<number>.png``. If the same number is
    detected again (e.g. on another page), the later image gets a
    ``_p<page>`` suffix instead of overwriting the earlier file.
    """
    # Get all regions
    regions = get_figures_and_tables_regions(pdf_path)

    if not regions:
        print("‚ö†Ô∏è No figures or tables found in the PDF!")
        return []

    print(f"üìä Found {len(regions)} figures/tables to extract")

    # Convert PDF pages to images
    print("üîÑ Converting PDF pages to images...")
    pdf_images = convert_from_path(pdf_path, dpi=dpi)
    print(f"   Converted {len(pdf_images)} pages")

    padding = 10          # extra pixels kept around every crop
    saved_files = []
    used_paths = set()    # output paths already written during this call

    # PyMuPDF is only used for the page sizes needed to scale coordinates.
    # The context manager closes the document even if a crop or save raises.
    with fitz.open(pdf_path) as doc:
        for item in regions:
            page_num = item['page']
            item_type = item['type']
            number = item['number']
            region = item['region']

            # Get the page image
            if page_num >= len(pdf_images):
                print(f"   ‚ö†Ô∏è Page {page_num + 1} not found, skipping {item['full_caption']}")
                continue

            page_img = pdf_images[page_num]
            img_width, img_height = page_img.size

            # Scale region coordinates (PDF units) to image pixels
            pdf_page = doc[page_num]
            scale_x = img_width / pdf_page.rect.width
            scale_y = img_height / pdf_page.rect.height

            # Padded crop box, clamped to the image
            x0 = max(0, int(region[0] * scale_x) - padding)
            y0 = max(0, int(region[1] * scale_y) - padding)
            x1 = min(img_width, int(region[2] * scale_x) + padding)
            y1 = min(img_height, int(region[3] * scale_y) + padding)

            # Ensure valid crop region
            if x1 <= x0 or y1 <= y0:
                print(f"   ‚ö†Ô∏è Invalid region for {item['full_caption']}, using full page")
                cropped = page_img
            else:
                cropped = page_img.crop((x0, y0, x1, y1))

            # Normalise the number for the filename: dashes become dots
            # ("1-1" -> "1.1") and spaces are removed.
            safe_number = number.replace('-', '.').replace(' ', '')

            if item_type == 'figure':
                prefix, target_dir = 'figure', output_figures_dir
            else:
                prefix, target_dir = 'table', output_tables_dir

            filename = f"{prefix}_{safe_number}.png"
            output_path = os.path.join(target_dir, filename)

            # Never overwrite an image saved earlier in this run: repeated
            # numbers get a page suffix (plus a counter if that also clashes).
            attempt = 1
            while output_path in used_paths:
                suffix = f"_p{page_num + 1}" if attempt == 1 else f"_p{page_num + 1}_{attempt}"
                filename = f"{prefix}_{safe_number}{suffix}.png"
                output_path = os.path.join(target_dir, filename)
                attempt += 1
            used_paths.add(output_path)

            # Save the cropped image
            cropped.save(output_path, 'PNG')
            saved_files.append({
                'type': item_type,
                'number': number,
                'filename': filename,
                'path': output_path,
                'page': page_num + 1
            })
            print(f"   ‚úÖ Saved: {filename} (from page {page_num + 1})")

    return saved_files

print("‚úÖ Extraction function defined!")

‚úÖ Extraction function defined!


In [None]:
# %% Cell 7: Alternative Method - Using PyMuPDF for Better Figure Detection

def extract_with_pymupdf(pdf_path, output_figures_dir, output_tables_dir, dpi=200):
    """
    Alternative extraction that renders each region directly with PyMuPDF.

    Captions are located with find_figure_table_captions(), de-duplicated per
    (type, number, page), and each region from estimate_figure_region() is
    rendered as a clipped pixmap at ``dpi`` (no full-page rasterisation).

    Parameters:
    - pdf_path: Path to the PDF file
    - output_figures_dir: Directory that receives the figure PNGs
    - output_tables_dir: Directory that receives the table PNGs
    - dpi: Render resolution (zoom factor is dpi / 72)

    Returns a list of dictionaries ('type', 'number', 'filename', 'path',
    1-based 'page'), one per saved image, or [] when nothing was detected.

    File naming: the first item with a given number is saved as
    ``figure_<number>.png`` / ``table_<number>.png``. If the same number is
    detected again (e.g. on another page), the later image gets a
    ``_p<page>`` suffix instead of overwriting the earlier file.
    """
    # Find the captions first so the document is only opened when there is
    # work to do (it used to be left open on the early return below).
    captions = find_figure_table_captions(pdf_path)

    # Remove duplicates
    seen = set()
    unique_captions = []
    for cap in captions:
        key = (cap['type'], cap['number'], cap['page'])
        if key not in seen:
            seen.add(key)
            unique_captions.append(cap)

    if not unique_captions:
        print("‚ö†Ô∏è No figures or tables found!")
        return []

    print(f"üìä Found {len(unique_captions)} figures/tables")

    # Render the clipped regions at higher resolution
    zoom = dpi / 72  # 72 is the default PDF resolution
    mat = fitz.Matrix(zoom, zoom)

    saved_files = []
    used_paths = set()    # output paths already written during this call

    # The context manager closes the document even if rendering or saving raises.
    with fitz.open(pdf_path) as doc:
        for item in unique_captions:
            page_num = item['page']
            item_type = item['type']
            number = item['number']

            page = doc[page_num]

            # Estimate the region
            region = estimate_figure_region(
                item,
                unique_captions,
                item['page_height'],
                item['page_width']
            )

            # Convert region to PyMuPDF rect and render only that part
            clip_rect = fitz.Rect(region[0], region[1], region[2], region[3])
            pix = page.get_pixmap(matrix=mat, clip=clip_rect)

            # Convert to PIL Image
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Normalise the number for the filename: dashes become dots
            # ("1-1" -> "1.1") and spaces are removed.
            safe_number = number.replace('-', '.').replace(' ', '')

            if item_type == 'figure':
                prefix, target_dir = 'figure', output_figures_dir
            else:
                prefix, target_dir = 'table', output_tables_dir

            filename = f"{prefix}_{safe_number}.png"
            output_path = os.path.join(target_dir, filename)

            # Never overwrite an image saved earlier in this run: repeated
            # numbers get a page suffix (plus a counter if that also clashes).
            attempt = 1
            while output_path in used_paths:
                suffix = f"_p{page_num + 1}" if attempt == 1 else f"_p{page_num + 1}_{attempt}"
                filename = f"{prefix}_{safe_number}{suffix}.png"
                output_path = os.path.join(target_dir, filename)
                attempt += 1
            used_paths.add(output_path)

            # Save
            img.save(output_path, 'PNG')
            saved_files.append({
                'type': item_type,
                'number': number,
                'filename': filename,
                'path': output_path,
                'page': page_num + 1
            })
            print(f"   ‚úÖ Saved: {filename} (from page {page_num + 1})")

    return saved_files

print("‚úÖ PyMuPDF extraction function defined!")

‚úÖ PyMuPDF extraction function defined!


In [None]:
# %% Cell 8: Full Page Extraction Method (Backup)

def extract_full_pages_with_annotations(pdf_path, output_figures_dir, output_tables_dir, dpi=200):
    """
    Extract full pages containing figures/tables.
    Use this if the region-based extraction doesn't work well.

    Every page is rasterised with pdf2image, and for each detected caption
    (de-duplicated per type, number and page) the whole page image is saved.

    Parameters:
    - pdf_path: Path to the PDF file
    - output_figures_dir: Directory that receives the figure PNGs
    - output_tables_dir: Directory that receives the table PNGs
    - dpi: Resolution used to rasterise the pages

    Returns a list of dictionaries ('type', 'number', 'filename', 'path',
    1-based 'page'), one per saved image, or [] when nothing was detected.

    File naming: the first item with a given number is saved as
    ``figure_<number>.png`` / ``table_<number>.png``. If the same number is
    detected again (e.g. on another page), the later image gets a
    ``_p<page>`` suffix instead of overwriting the earlier file.
    """
    captions = find_figure_table_captions(pdf_path)

    # Remove duplicates
    seen = set()
    unique_captions = []
    for cap in captions:
        key = (cap['type'], cap['number'], cap['page'])
        if key not in seen:
            seen.add(key)
            unique_captions.append(cap)

    if not unique_captions:
        print("‚ö†Ô∏è No figures or tables found!")
        return []

    # Convert all pages to images
    print("üîÑ Converting PDF to images...")
    pdf_images = convert_from_path(pdf_path, dpi=dpi)

    saved_files = []
    used_paths = set()    # output paths already written during this call

    for item in unique_captions:
        page_num = item['page']
        item_type = item['type']
        number = item['number']

        # Captions on pages that were not rasterised are skipped silently.
        if page_num >= len(pdf_images):
            continue

        page_img = pdf_images[page_num]

        # Normalise the number for the filename: dashes become dots
        # ("1-1" -> "1.1") and spaces are removed.
        safe_number = number.replace('-', '.').replace(' ', '')

        if item_type == 'figure':
            prefix, target_dir = 'figure', output_figures_dir
        else:
            prefix, target_dir = 'table', output_tables_dir

        filename = f"{prefix}_{safe_number}.png"
        output_path = os.path.join(target_dir, filename)

        # Never overwrite an image saved earlier in this run: repeated
        # numbers get a page suffix (plus a counter if that also clashes).
        attempt = 1
        while output_path in used_paths:
            suffix = f"_p{page_num + 1}" if attempt == 1 else f"_p{page_num + 1}_{attempt}"
            filename = f"{prefix}_{safe_number}{suffix}.png"
            output_path = os.path.join(target_dir, filename)
            attempt += 1
        used_paths.add(output_path)

        # Save full page
        page_img.save(output_path, 'PNG')
        saved_files.append({
            'type': item_type,
            'number': number,
            'filename': filename,
            'path': output_path,
            'page': page_num + 1
        })
        print(f"   ‚úÖ Saved: {filename} (full page {page_num + 1})")

    return saved_files

print("‚úÖ Full page extraction function defined!")

‚úÖ Full page extraction function defined!


In [None]:
# %% Cell 9: Smart Extraction with Visual Element Detection

def smart_extract_figures_tables(pdf_path, output_figures_dir, output_tables_dir, dpi=300):
    """
    Extract figures and tables using caption positions to bound each crop.

    Captions are de-duplicated per (type, number, page) and sorted by page
    and vertical position. For each caption the crop is limited by the
    neighbouring captions on the same page:

    - figure: from the previous caption's bottom (or the page top) down to
      20 units below its own caption
    - table: from 30 units above its caption down to the next caption's top
      (or the page bottom)

    Crops use the page width minus a 20-unit margin on each side. Captions
    without a known position are rendered as the full page height. Renders
    smaller than 50 px in either dimension are skipped.

    Parameters:
    - pdf_path: Path to the PDF file
    - output_figures_dir: Directory that receives the figure PNGs
    - output_tables_dir: Directory that receives the table PNGs
    - dpi: Render resolution (zoom factor is dpi / 72)

    Returns a list of dictionaries ('type', 'number', 'filename', 'path',
    1-based 'page', 'dimensions'), one per saved image, or [] when nothing
    was detected.

    File naming: the first item with a given number is saved as
    ``figure_<number>.png`` / ``table_<number>.png``. If the same number is
    detected again (e.g. on another page), the later image gets a
    ``_p<page>`` suffix instead of overwriting the earlier file.
    """
    # Find the captions first so the document is only opened when there is
    # work to do (it used to be left open on the early return below).
    captions = find_figure_table_captions(pdf_path)

    # Remove duplicates
    seen = set()
    unique_captions = []
    for cap in captions:
        key = (cap['type'], cap['number'], cap['page'])
        if key not in seen:
            seen.add(key)
            unique_captions.append(cap)

    # Sort by page, then by y position (captions without a position sort first)
    unique_captions.sort(key=lambda x: (x['page'], x['position'][1] if x['position'] else 0))

    if not unique_captions:
        print("‚ö†Ô∏è No figures or tables found!")
        return []

    print(f"üìä Found {len(unique_captions)} figures/tables to extract")

    # Render at high DPI
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    saved_files = []
    used_paths = set()    # output paths already written during this call

    # The context manager closes the document even if rendering or saving raises.
    with fitz.open(pdf_path) as doc:
        for idx, item in enumerate(unique_captions):
            page_num = item['page']
            item_type = item['type']
            number = item['number']
            caption_pos = item['position']

            page = doc[page_num]
            page_rect = page.rect

            # Full width with small margins
            crop_left = 20
            crop_right = page_rect.width - 20

            # Determine the vertical crop range based on caption position
            if caption_pos:
                caption_y_top = caption_pos[1]
                caption_y_bottom = caption_pos[3]

                # Find boundaries
                top_boundary = 0
                bottom_boundary = page_rect.height

                # Look for previous items on same page (list is sorted, so
                # they all sit before idx); keep the lowest one above us.
                for prev_item in unique_captions[:idx]:
                    if prev_item['page'] == page_num and prev_item['position']:
                        prev_bottom = prev_item['position'][3]
                        if prev_bottom < caption_y_top:
                            top_boundary = max(top_boundary, prev_bottom + 5)

                # Look for next item on same page; the first one below us wins.
                for next_item in unique_captions[idx+1:]:
                    if next_item['page'] == page_num and next_item['position']:
                        next_top = next_item['position'][1]
                        if next_top > caption_y_bottom:
                            bottom_boundary = min(bottom_boundary, next_top - 5)
                            break

                if item_type == 'figure':
                    # For figures: caption is usually below, so include area above caption
                    crop_top = top_boundary
                    crop_bottom = caption_y_bottom + 20  # Include caption
                else:
                    # For tables: include both caption and table content
                    crop_top = max(0, caption_y_top - 30)  # Start just above caption
                    crop_bottom = bottom_boundary
            else:
                # No position found, use reasonable defaults
                crop_top = 0
                crop_bottom = page_rect.height

            # Create clip rectangle and keep it within page bounds
            clip_rect = fitz.Rect(crop_left, crop_top, crop_right, crop_bottom)
            clip_rect = clip_rect & page_rect

            pix = page.get_pixmap(matrix=mat, clip=clip_rect)

            # Convert to PIL
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Skip if image is too small (likely a detection error)
            if img.height < 50 or img.width < 50:
                print(f"   ‚ö†Ô∏è Skipping {item['full_caption']} - detected region too small")
                continue

            # Normalise the number for the filename: dashes become dots
            # ("1-1" -> "1.1") and spaces are removed.
            safe_number = number.replace('-', '.').replace(' ', '')

            if item_type == 'figure':
                prefix, target_dir = 'figure', output_figures_dir
            else:
                prefix, target_dir = 'table', output_tables_dir

            filename = f"{prefix}_{safe_number}.png"
            output_path = os.path.join(target_dir, filename)

            # Never overwrite an image saved earlier in this run: repeated
            # numbers get a page suffix (plus a counter if that also clashes).
            attempt = 1
            while output_path in used_paths:
                suffix = f"_p{page_num + 1}" if attempt == 1 else f"_p{page_num + 1}_{attempt}"
                filename = f"{prefix}_{safe_number}{suffix}.png"
                output_path = os.path.join(target_dir, filename)
                attempt += 1
            used_paths.add(output_path)

            # Save
            img.save(output_path, 'PNG', optimize=True)
            saved_files.append({
                'type': item_type,
                'number': number,
                'filename': filename,
                'path': output_path,
                'page': page_num + 1,
                'dimensions': f"{img.width}x{img.height}"
            })
            print(f"   ‚úÖ Saved: {filename} (page {page_num + 1}, {img.width}x{img.height}px)")

    return saved_files

print("‚úÖ Smart extraction function defined!")

‚úÖ Smart extraction function defined!


In [None]:
# %% Cell 10: Main Execution - Run the Extraction

def main(pdf_path, method='smart'):
    """
    Run one of the extraction strategies on a PDF and print a summary.

    Parameters:
    - pdf_path: Path to the PDF file
    - method: 'smart' (recommended), 'pymupdf', 'pdf2image', or 'fullpage'

    Output always goes to the module-level FIGURES_DIR and TABLES_DIR.

    Returns the list of saved-file dictionaries produced by the chosen
    extractor, or None when the PDF does not exist or the method is unknown.
    """
    # Guard: nothing to do without an input file.
    if not os.path.exists(pdf_path):
        print(f"‚ùå Error: PDF file not found: {pdf_path}")
        return None

    divider = "-" * 50

    print(f"üìÑ Processing: {pdf_path}")
    print(f"üìÇ Output directories:")
    print(f"   - Figures: {FIGURES_DIR}")
    print(f"   - Tables: {TABLES_DIR}")
    print(f"üîß Method: {method}")
    print(divider)

    # Lambdas defer the name lookup, so only the selected extractor has to exist.
    runners = {
        'smart': lambda: smart_extract_figures_tables(pdf_path, FIGURES_DIR, TABLES_DIR),
        'pymupdf': lambda: extract_with_pymupdf(pdf_path, FIGURES_DIR, TABLES_DIR),
        'pdf2image': lambda: extract_and_save_images(pdf_path, FIGURES_DIR, TABLES_DIR),
        'fullpage': lambda: extract_full_pages_with_annotations(pdf_path, FIGURES_DIR, TABLES_DIR),
    }
    runner = runners.get(method) if isinstance(method, str) else None
    if runner is None:
        print(f"‚ùå Unknown method: {method}")
        return None

    results = runner()

    def count_of(kind):
        # Number of saved items of the given type.
        return sum(1 for entry in results if entry['type'] == kind)

    print(divider)
    print(f"‚úÖ Extraction complete!")
    print(f"   - Figures extracted: {count_of('figure')}")
    print(f"   - Tables extracted: {count_of('table')}")

    return results

In [None]:
# %% Cell 11: Execute Extraction
# Run this cell to extract figures and tables from your PDF

# Make sure to set PDF_PATH in Cell 3 before running!
# Only run when the configured PDF exists; otherwise point the user at the
# configuration cell. `results` is left as a notebook-level variable.
if os.path.exists(PDF_PATH):
    results = main(PDF_PATH, method='smart')

    # Display results summary
    # main() returns None on failure and a (possibly empty) list otherwise,
    # so the summary is skipped in both of those cases.
    if results:
        print("\nüìã Extracted Items Summary:")
        print("=" * 60)
        for r in results:
            print(f"  {r['type'].capitalize()} {r['number']}: {r['filename']} (page {r['page']})")
else:
    print(f"‚ö†Ô∏è Please update PDF_PATH in Cell 3 to point to your PDF file.")
    print(f"   Current value: {PDF_PATH}")

üìÑ Processing: riscv-spec-20191213.pdf
üìÇ Output directories:
   - Figures: output/Figures
   - Tables: output/Tables
üîß Method: smart
--------------------------------------------------
üìä Found 61 figures/tables to extract
   ‚úÖ Saved: figure_1.1.png (page 26, 2384x955px)
   ‚úÖ Saved: table_1.1.png (page 29, 2384x2084px)
   ‚úÖ Saved: figure_2.1.png (page 31, 2384x2583px)
   ‚úÖ Saved: figure_2.1.png (page 32, 2384x2192px)
   ‚úÖ Saved: figure_2.2.png (page 33, 2384x1396px)
   ‚úÖ Saved: figure_2.2.png (page 34, 2384x1010px)
   ‚úÖ Saved: figure_2.3.png (page 34, 2384x897px)
   ‚úÖ Saved: figure_2.4.png (page 35, 2384x688px)
   ‚úÖ Saved: table_2.1.png (page 39, 2384x703px)
   ‚úÖ Saved: table_2.1.png (page 40, 2384x2759px)
   ‚úÖ Saved: table_2.2.png (page 44, 2384x1524px)
   ‚úÖ Saved: table_2.3.png (page 46, 2384x1413px)
   ‚úÖ Saved: table_2.3.png (page 47, 2384x1100px)
   ‚úÖ Saved: figure_2.1.png (page 53, 2384x1908px)
   ‚úÖ Saved: table_5.1.png (page 56, 2384x2370px)

In [None]:
# %% Cell 13: Utility - Preview Extracted Images (Optional)

def preview_images(max_preview=5):
    """
    Display a preview of extracted images (for Jupyter notebooks).

    Shows up to ``max_preview`` PNG files from FIGURES_DIR followed by up to
    ``max_preview`` PNG files from TABLES_DIR (each in sorted filename order).
    Images wider than 600 px are scaled down proportionally, then embedded as
    base64 PNG data in a small HTML card.

    If an ImportError occurs (e.g. IPython is not installed), a hint is
    printed instead.
    """
    try:
        from IPython.display import display, HTML
        import base64
        from io import BytesIO

        max_width = 600  # previews are scaled down to at most this many pixels wide

        def _first_pngs(directory):
            # Only regular *.png files are previewed. Other directory entries
            # (hidden folders, stray non-image files) used to be handed to
            # Image.open and aborted the whole preview with an exception.
            if not os.path.exists(directory):
                return []
            names = sorted(
                name for name in os.listdir(directory)
                if name.lower().endswith('.png')
                and os.path.isfile(os.path.join(directory, name))
            )
            return [os.path.join(directory, name) for name in names[:max_preview]]

        all_files = [('Figure', path) for path in _first_pngs(FIGURES_DIR)]
        all_files += [('Table', path) for path in _first_pngs(TABLES_DIR)]

        for item_type, filepath in all_files:
            # The context manager releases the file handle after each image.
            with Image.open(filepath) as img:
                # Resize for preview
                if img.width > max_width:
                    ratio = max_width / img.width
                    preview = img.resize((int(img.width * ratio), int(img.height * ratio)))
                else:
                    preview = img

                # Convert to base64 for display
                buffer = BytesIO()
                preview.save(buffer, format='PNG')

            img_str = base64.b64encode(buffer.getvalue()).decode()

            filename = os.path.basename(filepath)
            html = f"""
            <div style="margin: 10px; padding: 10px; border: 1px solid #ddd; display: inline-block;">
                <h4>{item_type}: {filename}</h4>
                <img src="data:image/png;base64,{img_str}" style="max-width: 100%;">
            </div>
            """
            display(HTML(html))

    except ImportError:
        print("Preview is only available in Jupyter notebooks.")
        print("Use list_extracted_files() to see the extracted files.")

# Uncomment to preview (works in Jupyter):
# preview_images()

# %% Cell 14: Cleanup Utility (Optional)

def cleanup_output():
    """Delete OUTPUT_DIR with everything in it, then recreate the empty
    Figures and Tables folders.

    Destructive: all previously extracted files are removed. If OUTPUT_DIR
    does not exist, only the folder creation step runs.
    """
    import shutil

    output_present = os.path.exists(OUTPUT_DIR)
    if output_present:
        shutil.rmtree(OUTPUT_DIR)
        print(f"üóëÔ∏è Removed all files from {OUTPUT_DIR}")

    # Bring back the empty folder layout the extractors write into.
    for folder in (FIGURES_DIR, TABLES_DIR):
        os.makedirs(folder, exist_ok=True)
    print("üìÅ Output directories recreated")

# NOTE: this call is active (not commented out). Running the cell deletes
# everything under OUTPUT_DIR, including images extracted by earlier cells.
# Comment it out to keep previous results.
cleanup_output()


üóëÔ∏è Removed all files from output
üìÅ Output directories recreated


# FULL TEST

In [None]:
# %% [markdown]
# # PDF Figure and Table Extractor
#
# Extracts ALL figures and tables from a PDF and saves them as PNG images.
# - Figures saved in `Figures/` folder as `figure_X_X.png`
# - Tables saved in `Tables/` folder as `table_X_X.png`
#
# **Supports:**
# - Numeric captions: Figure 1.1, Table 2.3
# - Appendix captions: Figure A.1, Table B.2
# - Both colon (`:`) and period (`.`) formats

# %% Cell 1: Install Dependencies

import subprocess
import sys

# Dependencies of this second version of the notebook: PyMuPDF and Pillow.
packages = ['pymupdf', 'Pillow']
for pkg in packages:
    # One quiet pip call per package, run through the current interpreter;
    # check_call raises CalledProcessError if an install fails.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg, '-q', '--break-system-packages'])
print("‚úÖ Packages installed!")

# %% Cell 2: Import Libraries

import os
import re
import fitz  # PyMuPDF
from PIL import Image

print("‚úÖ Libraries imported!")

# %% Cell 3: Configuration
# ============================================
# CHANGE THESE SETTINGS
# ============================================

PDF_PATH = "riscv-spec-20191213.pdf"  # <-- Path to your PDF file
OUTPUT_DIR = "output"           # <-- Output directory
DPI = 300                       # <-- Image quality (150-300 recommended)

# ============================================

# Derived output folders. These assignments rebind the names PDF_PATH,
# OUTPUT_DIR, FIGURES_DIR and TABLES_DIR that an earlier cell also defines.
FIGURES_DIR = os.path.join(OUTPUT_DIR, "Figures")
TABLES_DIR = os.path.join(OUTPUT_DIR, "Tables")
# exist_ok=True makes this cell safe to re-run when the folders already exist.
os.makedirs(FIGURES_DIR, exist_ok=True)
os.makedirs(TABLES_DIR, exist_ok=True)

print(f"üìÅ Configuration:")
print(f"   PDF: {PDF_PATH}")
print(f"   Figures: {FIGURES_DIR}")
print(f"   Tables: {TABLES_DIR}")
print(f"   DPI: {DPI}")

# %% Cell 4: Smart Caption Finder

def find_captions_smart(doc, item_type, pattern_name):
    """
    Locate caption labels such as "Figure 1.1:" or "Table A.2. Title".

    Parameters:
    - doc: Opened document; pages are read via doc[i], and each page must
      offer get_text() and search_for()
    - item_type: Value stored under 'type' in every result ('figure'/'table')
    - pattern_name: Label word inserted into the regular expressions

    Four case-insensitive patterns are tried on every page, in this order:
    numeric + colon, letter + colon, numeric + period, letter + period. The
    period forms only count when the text right after the located label
    starts with an upper-case character. The first accepted hit for a number
    wins; later hits for the same number are ignored.

    Returns a list of dictionaries with 'type', 'number', 'page' (0-based),
    'rect' (first search_for() hit for the label) and 'y' (rect.y0), in the
    order the numbers were first accepted.
    """
    flags = re.IGNORECASE

    # (compiled pattern, suffix appended to the label when searching the page)
    suffix_patterns = [
        # NUMERIC format (1.1, 2.3, etc.) followed by a colon
        (re.compile(rf'({pattern_name}\s*(\d+\.\d+))\s*:', flags), ':'),
        # LETTER format (A.1, B.3, etc.) - appendices - followed by a colon
        (re.compile(rf'({pattern_name}\s*([A-Z]\.\d+))\s*:', flags), ':'),
        # NUMERIC format followed by a period and a description
        (re.compile(rf'({pattern_name}\s*(\d+\.\d+))\.\s+[A-Z]', flags), '.'),
        # LETTER format followed by a period and a description
        (re.compile(rf'({pattern_name}\s*([A-Z]\.\d+))\.\s+[A-Z]', flags), '.'),
    ]

    found = {}

    for page_index in range(len(doc)):
        page = doc[page_index]
        page_text = page.get_text()

        for pattern, suffix in suffix_patterns:
            for hit in pattern.finditer(page_text):
                label = hit.group(1)
                number = hit.group(2)

                # First accepted occurrence of a number wins.
                if number in found:
                    continue

                # Ask the page where the label (with its suffix) is drawn.
                hits = page.search_for(f"{label}{suffix}")
                if not hits:
                    continue
                rect = hits[0]

                if suffix == '.':
                    # Peek at a small strip right of the label: a caption's
                    # description must start with an upper-case character.
                    probe = fitz.Rect(rect.x1, rect.y0 - 5, rect.x1 + 50, rect.y1 + 5)
                    trailing = page.get_text(clip=probe).strip()
                    if not trailing or not trailing[0].isupper():
                        continue

                found[number] = {
                    'type': item_type,
                    'number': number,
                    'page': page_index,
                    'rect': rect,
                    'y': rect.y0
                }

    return list(found.values())

print("‚úÖ Caption finder defined!")

# %% Cell 5: Main Extraction Function

def extract_all(pdf_path, figures_dir, tables_dir, dpi=300):
    """Extract all figures and tables from the PDF.

    Captions are found with find_captions_smart() for 'Figure' and 'Table',
    then processed in page/vertical order. Each crop runs from the bottom of
    the previous caption on the same page (never higher than y=50) down to
    25 units below the item's own caption, with a 35-unit margin left and
    right. Crops less than 30 units tall are skipped.

    Parameters:
    - pdf_path: Path to the PDF file
    - figures_dir: Directory that receives ``figure_<number>.png`` files
    - tables_dir: Directory that receives ``table_<number>.png`` files
    - dpi: Render resolution (zoom factor is dpi / 72)

    Returns a list of dictionaries ('type', 'number', 'filename', 1-based
    'page'), one per saved image, or [] when no captions were found.
    """
    print("=" * 60)
    print("üîç SCANNING PDF FOR FIGURES AND TABLES")
    print("=" * 60)

    saved = []

    # The context manager closes the document on every exit path, including
    # exceptions raised while rendering or saving an image.
    with fitz.open(pdf_path) as doc:
        # Find all captions
        figure_captions = find_captions_smart(doc, 'figure', 'Figure')
        table_captions = find_captions_smart(doc, 'table', 'Table')

        all_captions = figure_captions + table_captions
        all_captions.sort(key=lambda x: (x['page'], x['y']))

        print(f"\nüìä Found {len(all_captions)} items:")
        print(f"   ‚Ä¢ Figures: {len(figure_captions)}")
        print(f"   ‚Ä¢ Tables: {len(table_captions)}")

        if not all_captions:
            print("\n‚ö†Ô∏è No captions found!")
            return []

        # Extract images
        print("\n" + "=" * 60)
        print("üì∏ EXTRACTING IMAGES")
        print("=" * 60)

        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)

        for cap in all_captions:
            page_num = cap['page']
            item_type = cap['type']
            number = cap['number']
            caption_rect = cap['rect']

            page = doc[page_num]
            page_rect = page.rect

            # Find top boundary: the lowest caption on the same page that
            # sits at least 20 units above this one (default top is y=50).
            top_y = 50
            for other in all_captions:
                if other['page'] == page_num and other['y'] < cap['y'] - 20:
                    top_y = max(top_y, other['rect'].y1 + 10)

            # Crop region, clamped to the page
            crop_rect = fitz.Rect(
                35, top_y,
                page_rect.width - 35,
                min(caption_rect.y1 + 25, page_rect.height - 20)
            )
            crop_rect = crop_rect & page_rect

            # Too little room above the caption to hold a figure/table.
            if crop_rect.height < 30:
                continue

            pix = page.get_pixmap(matrix=mat, clip=crop_rect)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Generate filename (replace . with _ for consistency)
            safe_number = number.replace('.', '_')

            if item_type == 'figure':
                filename = f"figure_{safe_number}.png"
                filepath = os.path.join(figures_dir, filename)
            else:
                filename = f"table_{safe_number}.png"
                filepath = os.path.join(tables_dir, filename)

            img.save(filepath, 'PNG')
            saved.append({
                'type': item_type,
                'number': number,
                'filename': filename,
                'page': page_num + 1
            })
            print(f"  ‚úÖ {item_type.capitalize()} {number} -> {filename} (page {page_num + 1})")

    # Summary
    print("\n" + "=" * 60)
    print("‚úÖ EXTRACTION COMPLETE")
    print("=" * 60)
    print(f"\n   Total: {len(saved)} items")
    print(f"   ‚Ä¢ Figures: {sum(1 for s in saved if s['type'] == 'figure')}")
    print(f"   ‚Ä¢ Tables: {sum(1 for s in saved if s['type'] == 'table')}")

    return saved

print("‚úÖ Extraction function defined!")

# %% Cell 6: Run Extraction
# Make sure PDF_PATH is set correctly in Cell 3!

# Run the extraction only when the configured PDF exists. `results` is only
# assigned on that branch; it is not set when the file is missing.
if os.path.exists(PDF_PATH):
    results = extract_all(PDF_PATH, FIGURES_DIR, TABLES_DIR, DPI)
else:
    print(f"‚ùå PDF not found: {PDF_PATH}")
    print("   Please update PDF_PATH in Cell 3")

# %% Cell 7: List Extracted Files

def _listing_sort_key(filename):
    """Sort key that orders 'figure_2_1.png' before 'figure_10_1.png' before 'figure_A_1.png'.

    Each part of the number is tagged (0, int) or (1, str) so that integers and
    letters are never compared with each other directly.
    """
    # Extract the number part (e.g., "1_1" or "A_1" from "figure_1_1.png")
    match = re.search(r'_([A-Z0-9]+_\d+)\.png', filename)
    if not match:
        return [(0, 0)]
    return [
        (0, int(part)) if part.isdigit() else (1, part)
        for part in match.group(1).split('_')
    ]


def _print_png_listing(directory):
    """Print one line per PNG found in `directory` and return how many were printed.

    Prints "(none)" when the directory is missing or holds no PNG files.
    """
    png_names = []
    if os.path.isdir(directory):
        # Read the directory once and ignore stray entries (.DS_Store,
        # .ipynb_checkpoints, ...) so they are neither listed nor counted.
        png_names = [name for name in os.listdir(directory) if name.endswith('.png')]

    if not png_names:
        print("   (none)")
        return 0

    # The filename is a tie-breaker so the order does not depend on os.listdir().
    for name in sorted(png_names, key=lambda n: (_listing_sort_key(n), n)):
        size_kb = os.path.getsize(os.path.join(directory, name)) / 1024
        print(f"   {name} ({size_kb:.1f} KB)")
    return len(png_names)


def list_files(figures_dir=None, tables_dir=None):
    """List all extracted files.

    Args:
        figures_dir: Folder holding the figure PNGs. Defaults to FIGURES_DIR.
        tables_dir: Folder holding the table PNGs. Defaults to TABLES_DIR.

    Prints a naturally sorted listing (name and size in KB) for each folder,
    followed by the total number of PNG files found. Returns None.
    """
    if figures_dir is None:
        figures_dir = FIGURES_DIR
    if tables_dir is None:
        tables_dir = TABLES_DIR

    print("\n" + "=" * 60)
    print("üìÅ EXTRACTED FILES")
    print("=" * 60)

    # Figures
    print("\nüì∑ FIGURES:")
    fig_count = _print_png_listing(figures_dir)

    # Tables
    print("\nüìä TABLES:")
    tbl_count = _print_png_listing(tables_dir)

    print(f"\n   TOTAL: {fig_count} figures, {tbl_count} tables")

# Print the listing of what is currently on disk under FIGURES_DIR and TABLES_DIR.
list_files()

# %% Cell 8: Verify Extraction

def _extracted_numbers(directory, prefix):
    """Return the item numbers encoded in '<prefix>_<number>.png' names in `directory`.

    Files that do not follow that naming scheme are ignored, so stray entries
    are not mistaken for extracted items. A missing directory yields an empty set.
    """
    numbers = set()
    if not os.path.isdir(directory):
        return numbers
    name_pattern = re.compile(rf'^{re.escape(prefix)}_(.+)\.png$')
    for name in os.listdir(directory):
        match = name_pattern.match(name)
        if match:
            # Convert filename back to number (figure_A_1.png -> A.1)
            numbers.add(match.group(1).replace('_', '.'))
    return numbers


def _number_sort_key(number):
    """Sort key for '2.1' / 'A.1' style numbers: integers first, then letters."""
    return [
        (0, int(part)) if part.isdigit() else (1, part)
        for part in number.split('.')
    ]


def _print_coverage(heading, noun, mentioned, extracted):
    """Print the extracted/mentioned counts for one item kind and list what is missing."""
    print(f"\n{heading}: {len(extracted)} extracted / {len(mentioned)} mentioned")
    missing = mentioned - extracted
    if missing:
        print(f"   ‚ö†Ô∏è Not extracted: {sorted(missing, key=_number_sort_key)}")
        print("   (These may be references only)")
    else:
        print(f"   ‚úÖ All {noun} extracted!")


def verify(pdf_path, figures_dir=None, tables_dir=None):
    """Check if any figures/tables might be missing.

    Scans the text of every page for "Figure N.M" / "Table N.M" references
    (numeric or single-letter prefix) and compares them with the PNG files
    present in the output folders.

    Args:
        pdf_path: Path of the PDF to scan.
        figures_dir: Folder holding the figure PNGs. Defaults to FIGURES_DIR.
        tables_dir: Folder holding the table PNGs. Defaults to TABLES_DIR.

    Returns:
        None. The report is printed. If `pdf_path` does not exist the function
        returns silently without printing anything.
    """
    if not os.path.exists(pdf_path):
        return

    if figures_dir is None:
        figures_dir = FIGURES_DIR
    if tables_dir is None:
        tables_dir = TABLES_DIR

    # Find all mentioned figures/tables (both numeric and letter formats)
    all_figs = set()
    all_tbls = set()

    fig_pattern = re.compile(r'Figure\s*(\d+\.\d+|[A-Z]\.\d+)', re.IGNORECASE)
    tbl_pattern = re.compile(r'Table\s*(\d+\.\d+|[A-Z]\.\d+)', re.IGNORECASE)

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text()
            # IGNORECASE also lets the letter prefix match in lowercase
            # ("figure a.1"); upper-case it so it compares equal to the "A.1"
            # recovered from the file names.
            for m in fig_pattern.finditer(text):
                all_figs.add(m.group(1).upper())
            for m in tbl_pattern.finditer(text):
                all_tbls.add(m.group(1).upper())
    finally:
        # Release the document even if text extraction fails part-way.
        doc.close()

    # What was extracted
    extracted_figs = _extracted_numbers(figures_dir, 'figure')
    extracted_tbls = _extracted_numbers(tables_dir, 'table')

    print("\n" + "=" * 60)
    print("üîç VERIFICATION")
    print("=" * 60)

    _print_coverage("üì∑ Figures", "figures", all_figs, extracted_figs)
    _print_coverage("üìä Tables", "tables", all_tbls, extracted_tbls)

# Run the cross-check only when the configured PDF is present; if it is
# missing, this cell does nothing and prints nothing.
if os.path.exists(PDF_PATH):
    verify(PDF_PATH)

# %% Cell 9: Cleanup (Optional)

def cleanup(figures_dir=None, tables_dir=None):
    """Remove all extracted files.

    Empties the figures and tables folders and recreates them, leaving any
    other content of the base output directory untouched. Deleting only these
    two folders keeps an OUTPUT_DIR that points at a shared location (for
    example ".") from being wiped wholesale.

    Args:
        figures_dir: Folder holding the figure PNGs. Defaults to FIGURES_DIR.
        tables_dir: Folder holding the table PNGs. Defaults to TABLES_DIR.
    """
    import shutil

    if figures_dir is None:
        figures_dir = FIGURES_DIR
    if tables_dir is None:
        tables_dir = TABLES_DIR

    for directory in (figures_dir, tables_dir):
        if os.path.isdir(directory):
            shutil.rmtree(directory)
        # Recreate the folder so later cells can write into it right away.
        os.makedirs(directory, exist_ok=True)
    print("üóëÔ∏è All files removed!")

# Uncomment to run:
# cleanup()

‚úÖ Packages installed!
‚úÖ Libraries imported!
üìÅ Configuration:
   PDF: riscv-spec-20191213.pdf
   Figures: output/Figures
   Tables: output/Tables
   DPI: 300
‚úÖ Caption finder defined!
‚úÖ Extraction function defined!
üîç SCANNING PDF FOR FIGURES AND TABLES

üìä Found 79 items:
   ‚Ä¢ Figures: 41
   ‚Ä¢ Tables: 38

üì∏ EXTRACTING IMAGES
  ‚úÖ Figure 1.1 -> figure_1_1.png (page 26)
  ‚úÖ Table 1.1 -> table_1_1.png (page 29)
  ‚úÖ Figure 2.1 -> figure_2_1.png (page 32)
  ‚úÖ Figure 2.2 -> figure_2_2.png (page 34)
  ‚úÖ Figure 2.3 -> figure_2_3.png (page 34)
  ‚úÖ Figure 2.4 -> figure_2_4.png (page 35)
  ‚úÖ Table 2.1 -> table_2_1.png (page 39)
  ‚úÖ Table 2.2 -> table_2_2.png (page 44)
  ‚úÖ Table 2.3 -> table_2_3.png (page 47)
  ‚úÖ Table 5.1 -> table_5_1.png (page 57)
  ‚úÖ Table 7.1 -> table_7_1.png (page 62)
  ‚úÖ Figure 8.1 -> figure_8_1.png (page 68)
  ‚úÖ Figure 8.2 -> figure_8_2.png (page 72)
  ‚úÖ Table 9.1 -> table_9_1.png (page 74)
  ‚úÖ Figure 10.1 -> figure_10_1.pn