In [1]:
import fitz  # PyMuPDF
import pdfplumber
import camelot
from typing import Dict, List, Any
import pandas as pd

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [2]:


def parse_pdf_first_10_pages(pdf_path: str, max_pages: int = 10) -> Dict[str, Any]:
    """
    Parse the leading pages of a PDF using PyMuPDF, pdfplumber, and camelot.

    Despite the historical name, the number of pages is configurable through
    ``max_pages``; the default of 10 keeps the original behavior.

    Args:
        pdf_path (str): Path to the PDF file
        max_pages (int): Upper bound on the number of leading pages to read.
            Must be at least 1.

    Returns:
        Dict with the keys:
            'pdf_path': the path that was passed in
            'pages_processed': number of pages read by PyMuPDF
            'pymupdf_data': one dict per page (text, layout dict, size)
            'pdfplumber_data': one dict per page (text, tables, page info)
            'camelot_data': one dict per table found by camelot
            'camelot_error': only present when camelot raised
            'summary': character/table totals (only present on success)
            'error': only present when processing failed; the exception is
                printed and stored here instead of being re-raised
    """
    result = {
        'pdf_path': pdf_path,
        'pages_processed': 0,
        'pymupdf_data': [],
        'pdfplumber_data': [],
        'camelot_data': []
    }

    try:
        # Validated inside the try so a bad limit is reported through
        # result['error'] like every other failure of this function.
        if max_pages < 1:
            raise ValueError(f"max_pages must be at least 1, got {max_pages!r}")

        # PyMuPDF extraction
        print("Extracting with PyMuPDF...")
        with fitz.open(pdf_path) as doc:
            total_pages = min(len(doc), max_pages)
            result['pages_processed'] = total_pages

            for page_num in range(total_pages):
                page = doc[page_num]

                # Plain text first, then the structured "dict" form that
                # carries layout information.
                text = page.get_text()
                text_dict = page.get_text("dict")

                # Page dimensions
                rect = page.rect

                result['pymupdf_data'].append({
                    'page_number': page_num + 1,
                    'text': text,
                    'text_dict': text_dict,
                    'width': rect.width,
                    'height': rect.height,
                    'char_count': len(text)
                })

        # pdfplumber extraction
        print("Extracting with pdfplumber...")
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages[:max_pages], start=1):
                # extract_text() may return None; normalize to "" so
                # len() below is always valid.
                text = page.extract_text() or ""
                tables = page.extract_tables()

                page_info = {
                    'width': page.width,
                    'height': page.height,
                    'rotation': getattr(page, 'rotation', 0)
                }

                result['pdfplumber_data'].append({
                    'page_number': page_number,
                    'text': text,
                    'tables': tables,
                    'table_count': len(tables) if tables else 0,
                    'page_info': page_info,
                    'char_count': len(text)
                })

        # Camelot extraction (tables only). Failures here are recorded but do
        # not abort the whole run, so the text results above are still returned.
        print("Extracting tables with camelot...")
        try:
            # Camelot takes a 1-based page range string.
            page_range = f"1-{result['pages_processed']}"
            tables = camelot.read_pdf(pdf_path, pages=page_range, flavor='lattice')

            # If lattice doesn't work, try stream
            if len(tables) == 0:
                tables = camelot.read_pdf(pdf_path, pages=page_range, flavor='stream')

            result['camelot_data'] = [
                {
                    'table_number': i + 1,
                    'page_number': table.page,
                    'shape': table.shape,
                    'accuracy': table.accuracy,
                    'whitespace': table.whitespace,
                    'dataframe': table.df.to_dict('records'),  # Convert to dict for JSON serialization
                    'raw_text': table.df.to_string()
                }
                for i, table in enumerate(tables)
            ]

        except Exception as e:
            print(f"Camelot extraction failed: {e}")
            result['camelot_data'] = []
            result['camelot_error'] = str(e)

        # Summary statistics
        result['summary'] = {
            'total_pymupdf_chars': sum(page['char_count'] for page in result['pymupdf_data']),
            'total_pdfplumber_chars': sum(page['char_count'] for page in result['pdfplumber_data']),
            'total_pdfplumber_tables': sum(page['table_count'] for page in result['pdfplumber_data']),
            'total_camelot_tables': len(result['camelot_data'])
        }

        print("Extraction completed successfully!")
        print(f"Pages processed: {result['pages_processed']}")
        print(f"PyMuPDF chars: {result['summary']['total_pymupdf_chars']}")
        print(f"pdfplumber chars: {result['summary']['total_pdfplumber_chars']}")
        print(f"pdfplumber tables: {result['summary']['total_pdfplumber_tables']}")
        print(f"Camelot tables: {result['summary']['total_camelot_tables']}")

    except Exception as e:
        print(f"Error processing PDF: {e}")
        result['error'] = str(e)

    return result

def save_extraction_results(result: Dict[str, Any], output_path: str):
    """
    Write an extraction result dictionary to disk as indented JSON.

    Values that the JSON encoder cannot handle natively are converted with
    ``str`` instead of raising.

    Args:
        result (Dict[str, Any]): Data to serialize
        output_path (str): Destination file; it is opened in write mode, so
            any existing content is replaced
    """
    import json

    with open(output_path, 'w') as out_file:
        json.dump(result, out_file, default=str, indent=2)

    print("Results saved to: {}".format(output_path))


# Run the extraction on the PDF and write the JSON output only when the
# parser did not record an error.
result = parse_pdf_first_10_pages("./HB410-SD1-(OHA).pdf")

if 'error' in result:
    print(f"Processing failed: {result['error']}")
else:
    save_extraction_results(result, 'output.json')

Extracting with PyMuPDF...
Extracting with pdfplumber...
Extracting tables with camelot...
Extraction completed successfully!
Pages processed: 7
PyMuPDF chars: 8158
pdfplumber chars: 7924
pdfplumber tables: 0
Camelot tables: 7
Results saved to: output.json


In [3]:
import fitz  # PyMuPDF
import pdfplumber
import camelot
from typing import Dict, List, Any
import json

def _pymupdf_page_text(doc, page_index: int) -> str:
    """Return the stripped PyMuPDF text of one page, or an error message string."""
    try:
        return doc[page_index].get_text().strip()
    except Exception as e:
        return f"PyMuPDF error: {str(e)}"


def _pdfplumber_page_text(pdf, page_index: int) -> str:
    """Return pdfplumber text plus a plain-text table dump for one page, or an error message string."""
    try:
        page = pdf.pages[page_index]
        # extract_text() may return None; normalize so concatenation works.
        text = page.extract_text() or ""

        # Also extract tables and convert to text
        tables = page.extract_tables()
        if tables:
            table_text = "\n\n--- TABLES ---\n"
            for i, table in enumerate(tables):
                table_text += f"\nTable {i+1}:\n"
                for row in table:
                    if row:  # Skip empty rows
                        table_text += " | ".join(str(cell) if cell else "" for cell in row) + "\n"
            text += table_text

        return text.strip()
    except Exception as e:
        return f"pdfplumber error: {str(e)}"


def parse_pdf_first_10_pages(pdf_path: str, max_pages: int = 10) -> Dict[str, Any]:
    """
    Parse the leading pages of a PDF using PyMuPDF, pdfplumber, and camelot.
    Stores output as simple text properties.

    Each library opens the file once and is then queried page by page. A
    failure on a single page (or of a whole library) is written into that
    page's text field as an error message instead of aborting the run.

    Args:
        pdf_path (str): Path to the PDF file
        max_pages (int): Upper bound on the number of leading pages to read.
            Must be at least 1. Defaults to 10.

    Returns:
        Dict with the keys:
            'pdf_path': the path that was passed in
            'pages_processed': number of pages read
            'pages': one dict per page with 'page_number', 'pymupdf_text',
                'pdfplumber_text' and 'camelot_text'
            'summary': page count and per-library character totals (only
                present on success). The totals are the lengths of the text
                fields as stored, so placeholder strings such as
                "No tables found" and error messages are counted too.
            'error': only present when processing failed; the exception is
                printed and stored here instead of being re-raised
    """
    result = {
        'pdf_path': pdf_path,
        'pages_processed': 0,
        'pages': []
    }

    try:
        # Validated inside the try so a bad limit is reported through
        # result['error'] like every other failure of this function.
        if max_pages < 1:
            raise ValueError(f"max_pages must be at least 1, got {max_pages!r}")

        # PyMuPDF: open once, determine the page count, and read every page.
        with fitz.open(pdf_path) as doc:
            total_pages = min(len(doc), max_pages)
            result['pages_processed'] = total_pages
            pymupdf_texts = [_pymupdf_page_text(doc, i) for i in range(total_pages)]

        # pdfplumber: open once as well. If the file cannot be opened at all,
        # every page receives the same error message.
        try:
            with pdfplumber.open(pdf_path) as pdf:
                pdfplumber_texts = [_pdfplumber_page_text(pdf, i) for i in range(total_pages)]
        except Exception as e:
            pdfplumber_texts = [f"pdfplumber error: {str(e)}"] * total_pages

        for page_index in range(total_pages):
            result['pages'].append({
                'page_number': page_index + 1,
                'pymupdf_text': pymupdf_texts[page_index],
                'pdfplumber_text': pdfplumber_texts[page_index],
                'camelot_text': ''
            })

        # Camelot extraction (process all pages at once)
        print("Extracting tables with camelot...")
        try:
            page_range = f"1-{total_pages}"
            tables = camelot.read_pdf(pdf_path, pages=page_range, flavor='lattice')

            # If lattice doesn't work, try stream
            if len(tables) == 0:
                tables = camelot.read_pdf(pdf_path, pages=page_range, flavor='stream')

            # Organize tables by page
            camelot_by_page = {}
            for table in tables:
                # Convert table to text
                table_text = f"Table (accuracy: {table.accuracy:.2f}):\n"
                table_text += table.df.to_string(index=False)
                camelot_by_page.setdefault(table.page, []).append(table_text)

            # Add camelot text to each page
            for page_data in result['pages']:
                page_tables = camelot_by_page.get(page_data['page_number'])
                if page_tables is not None:
                    page_data['camelot_text'] = "\n\n".join(page_tables)
                else:
                    page_data['camelot_text'] = "No tables found"

        except Exception as e:
            print(f"Camelot extraction failed: {e}")
            for page_data in result['pages']:
                page_data['camelot_text'] = f"Camelot error: {str(e)}"

        # Summary statistics
        result['summary'] = {
            'total_pages': total_pages,
            'pymupdf_total_chars': sum(len(p['pymupdf_text']) for p in result['pages']),
            'pdfplumber_total_chars': sum(len(p['pdfplumber_text']) for p in result['pages']),
            'camelot_total_chars': sum(len(p['camelot_text']) for p in result['pages'])
        }

        print("Extraction completed successfully!")
        print(f"Pages processed: {result['pages_processed']}")
        print(f"PyMuPDF total chars: {result['summary']['pymupdf_total_chars']}")
        print(f"pdfplumber total chars: {result['summary']['pdfplumber_total_chars']}")
        print(f"Camelot total chars: {result['summary']['camelot_total_chars']}")

    except Exception as e:
        print(f"Error processing PDF: {e}")
        result['error'] = str(e)

    return result

def save_extraction_results(result: Dict[str, Any], output_path: str):
    """
    Write an extraction result dictionary to disk as indented JSON.

    Values that the JSON encoder cannot handle natively are converted with
    ``str`` instead of raising.

    Args:
        result (Dict[str, Any]): Data to serialize
        output_path (str): Destination file; it is opened in write mode, so
            any existing content is replaced
    """
    with open(output_path, 'w') as out_file:
        json.dump(result, out_file, default=str, indent=2)

    print("Results saved to: {}".format(output_path))

# Run the extraction on the PDF and write the JSON output only when the
# parser did not record an error.
result = parse_pdf_first_10_pages("./HB410-SD1-(OHA).pdf")

if 'error' in result:
    print(f"Processing failed: {result['error']}")
else:
    save_extraction_results(result, 'output.json')

Extracting tables with camelot...
Extraction completed successfully!
Pages processed: 7
PyMuPDF total chars: 8149
pdfplumber total chars: 7924
Camelot total chars: 21535
Results saved to: output.json
