In [1]:
!pip install llmsherpa



In [6]:
# No longer need 'google.colab'
from llmsherpa.readers import LayoutPDFReader
import os

# 1. DEFINE YOUR LOCAL FILE PATH
# Replace "your_file_name.pdf" with the actual name of your PDF file.
pdf_path = "AB_MS365_-_Central_Services_-_SOC_1_Report_(9-30-2024)_.pdf"

# Check if the file exists before proceeding
if not os.path.exists(pdf_path):
    print(f"Error: File not found at '{pdf_path}'")
    print("Please make sure the PDF file is in the same directory as your notebook.")
else:
    try:
        # 2. CORRECTED API URL
        # The URL for the reader should not include query parameters.
        api_url = "http://localhost:5010/api/parseDocument"
        reader = LayoutPDFReader(parser_api_url=api_url)
        
        # The reader directly accepts the file path
        print(f"Processing {pdf_path}...")
        doc = reader.read_pdf(pdf_path)

        markdown_output = ""
        # Using doc.sections() is a more direct way to iterate
        for section in doc.sections():
            markdown_output += f"### {section.title}\n\n"
            markdown_output += section.to_text(include_children=True, recurse=True) + "\n\n"

        print("--- Document Content ---")
        print(markdown_output)
        print("------------------------")

    except Exception as e:
        print(f"An error occurred: {e}")

Processing AB_MS365_-_Central_Services_-_SOC_1_Report_(9-30-2024)_.pdf...
--- Document Content ---
### Microsoft Corporation—

Microsoft Corporation—
Microsoft 365 Central Services
System and Organization Controls (SOC) 1 Report
October 1, 2023, through September 30, 2024
This report, including the description of tests of controls and results in Section 4, is intended solely for the information and use of management of Microsoft, user entities of Microsoft’s M365 Central Services system during some or all of the period October 1, 2023, to September 30, 2024, and their auditors who audit and report on such user entities’ financial statements or internal control over financial reporting and have a sufficient understanding to consider it, along with other information including information about controls implemented by user entities themselves, when assessing the risks of material misstatement of user entities’ financial statements.
This report is not intended to be and should not be used 

In [7]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
   -- ------------------------------------- 1.3/18.7 MB 7.4 MB/s eta 0:00:03
   ------------- -------------------------- 6.3/18.7 MB 16.1 MB/s eta 0:00:01
   ------------------------- -------------- 12.1/18.7 MB 20.4 MB/s eta 0:00:01
   ------------------------------------- -- 17.6/18.7 MB 21.3 MB/s eta 0:00:01
   ---------------------------------------- 18.7/18.7 MB 20.4 MB/s  0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [None]:
import os
import json
import pandas as pd
from pathlib import Path
from datetime import datetime
from llmsherpa.readers import LayoutPDFReader

class PDFProcessor:
    def __init__(self, sherpa_url="http://localhost:5010/api/parseDocument?renderFormat=all"):
        """
        Initialize PDF processor with LLMSherpa connection
        
        Args:
            sherpa_url: URL of your local LLMSherpa Docker instance
        """
        self.sherpa_url = sherpa_url
        self.reader = LayoutPDFReader(sherpa_url)
        self.output_dir = Path("output")
        self.setup_output_directories()
    
    def setup_output_directories(self):
        """Create output directory structure"""
        directories = [
            self.output_dir,
            self.output_dir / "text",
            self.output_dir / "tables", 
            self.output_dir / "images",
            self.output_dir / "metadata",
            self.output_dir / "reports"
        ]
        
        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)
        
        print(f"✓ Output directories created at: {self.output_dir.absolute()}")
    
    def validate_pdf_path(self, pdf_path):
        """
        Validate if the provided PDF path exists and is a PDF file
        
        Args:
            pdf_path: Path to the PDF file
            
        Returns:
            bool: True if valid, False otherwise
        """
        path = Path(pdf_path)
        
        if not path.exists():
            print(f"❌ Error: File does not exist: {pdf_path}")
            return False
        
        if not path.suffix.lower() == '.pdf':
            print(f"❌ Error: File is not a PDF: {pdf_path}")
            return False
        
        print(f"✓ Valid PDF file: {pdf_path}")
        return True
    
    def process_pdf(self, pdf_path):
        """
        Process PDF using LLMSherpa and extract content
        
        Args:
            pdf_path: Path to the PDF file to process
            
        Returns:
            dict: Processing results and statistics
        """
        if not self.validate_pdf_path(pdf_path):
            return None
        
        try:
            print(f"🔄 Processing PDF: {pdf_path}")
            print(f"🔗 Using LLMSherpa at: {self.sherpa_url}")
            
            # Parse document using LLMSherpa
            doc = self.reader.read_pdf(pdf_path)
            
            # Extract content
            results = self.extract_content(doc, pdf_path)
            
            # Save results
            self.save_results(results, pdf_path)
            
            print("✅ PDF processing completed successfully!")
            return results
            
        except Exception as e:
            print(f"❌ Error processing PDF: {str(e)}")
            return None
    
    def extract_content(self, doc, pdf_path):
        """
        Extract different types of content from the parsed document
        
        Args:
            doc: Parsed document from LLMSherpa
            pdf_path: Original PDF path
            
        Returns:
            dict: Extracted content organized by type
        """
        results = {
            'metadata': {
                'source_file': pdf_path,
                'processed_at': datetime.now().isoformat(),
                'total_pages': len(doc.chunks()) if hasattr(doc, 'chunks') else 0,
                'total_sections': len(doc.sections()) if hasattr(doc, 'sections') else 0,
                'total_tables': len(doc.tables()) if hasattr(doc, 'tables') else 0
            },
            'text_content': [],
            'tables': [],
            'images': [],
            'sections': []
        }
        
        # Extract text content
        print("📝 Extracting text content...")
        try:
            for chunk in doc.chunks():
                results['text_content'].append({
                    'content': chunk.to_text(),
                    'page_number': getattr(chunk, 'page_idx', None),
                    'chunk_type': getattr(chunk, 'tag', 'text')
                })
        except Exception as e:
            print(f"⚠️  Warning: Error extracting text content: {e}")
        
        # Extract tables
        print("📊 Extracting tables...")
        try:
            for i, table in enumerate(doc.tables()):
                table_data = {
                    'table_id': i,
                    'content': table.to_text(),
                    'html': getattr(table, 'to_html', lambda: '')(),
                    'page_number': getattr(table, 'page_idx', None)
                }
                results['tables'].append(table_data)
        except Exception as e:
            print(f"⚠️  Warning: Error extracting tables: {e}")
        
        # Extract sections/hierarchy
        print("📋 Extracting document structure...")
        try:
            for section in doc.sections():
                results['sections'].append({
                    'title': section.title,
                    'content': section.to_text(),
                    'level': getattr(section, 'level', 0),
                    'page_number': getattr(section, 'page_idx', None)
                })
        except Exception as e:
            print(f"⚠️  Warning: Error extracting sections: {e}")
        
        return results
    
    def save_results(self, results, pdf_path):
        """
        Save extracted content to organized file structure
        
        Args:
            results: Extracted content dictionary
            pdf_path: Original PDF path for naming
        """
        pdf_name = Path(pdf_path).stem
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save metadata
        metadata_file = self.output_dir / "metadata" / f"{pdf_name}_{timestamp}_metadata.json"
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(results['metadata'], f, indent=2, ensure_ascii=False)
        
        # Save text content
        text_file = self.output_dir / "text" / f"{pdf_name}_{timestamp}_text.json"
        with open(text_file, 'w', encoding='utf-8') as f:
            json.dump(results['text_content'], f, indent=2, ensure_ascii=False)
        
        # Save sections
        sections_file = self.output_dir / "text" / f"{pdf_name}_{timestamp}_sections.json"
        with open(sections_file, 'w', encoding='utf-8') as f:
            json.dump(results['sections'], f, indent=2, ensure_ascii=False)
        
        # Save tables
        if results['tables']:
            tables_file = self.output_dir / "tables" / f"{pdf_name}_{timestamp}_tables.json"
            with open(tables_file, 'w', encoding='utf-8') as f:
                json.dump(results['tables'], f, indent=2, ensure_ascii=False)
            
            # Also save as CSV if possible
            for i, table in enumerate(results['tables']):
                try:
                    csv_file = self.output_dir / "tables" / f"{pdf_name}_{timestamp}_table_{i}.csv"
                    # You might need to parse the table content to create proper CSV
                    with open(csv_file, 'w', encoding='utf-8') as f:
                        f.write(table['content'])
                except Exception as e:
                    print(f"⚠️  Warning: Could not save table {i} as CSV: {e}")
        
        # Generate processing report
        self.generate_report(results, pdf_path, timestamp)
        
        print(f"📁 Results saved to: {self.output_dir.absolute()}")
    
    def generate_report(self, results, pdf_path, timestamp):
        """
        Generate a summary report of the extraction process
        
        Args:
            results: Extracted content dictionary
            pdf_path: Original PDF path
            timestamp: Processing timestamp
        """
        pdf_name = Path(pdf_path).stem
        report_file = self.output_dir / "reports" / f"{pdf_name}_{timestamp}_report.txt"
        
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(f"PDF Processing Report\n")
            f.write(f"==================\n\n")
            f.write(f"Source File: {pdf_path}\n")
            f.write(f"Processed At: {results['metadata']['processed_at']}\n")
            f.write(f"Total Pages: {results['metadata']['total_pages']}\n")
            f.write(f"Total Sections: {results['metadata']['total_sections']}\n")
            f.write(f"Total Tables: {results['metadata']['total_tables']}\n")
            f.write(f"Text Chunks: {len(results['text_content'])}\n\n")
            
            f.write("Content Summary:\n")
            f.write("-" * 20 + "\n")
            
            if results['sections']:
                f.write("Document Sections:\n")
                for i, section in enumerate(results['sections'][:10]):  # First 10 sections
                    f.write(f"  {i+1}. {section['title'][:100]}...\n")
                if len(results['sections']) > 10:
                    f.write(f"  ... and {len(results['sections']) - 10} more sections\n")
                f.write("\n")
            
            if results['tables']:
                f.write("Extracted Tables:\n")
                for i, table in enumerate(results['tables']):
                    f.write(f"  Table {i+1}: Page {table.get('page_number', 'N/A')}\n")
                f.write("\n")
    
    def interactive_mode(self):
        """
        Interactive mode to accept PDF path from user
        """
        print("🚀 LLMSherpa PDF Processor")
        print("=" * 40)
        print(f"Connected to LLMSherpa at: {self.sherpa_url}")
        print()
        
        while True:
            try:
                pdf_path = input("📄 Enter PDF file path (or 'quit' to exit): ").strip()
                
                if pdf_path.lower() in ['quit', 'exit', 'q']:
                    print("👋 Goodbye!")
                    break
                
                if not pdf_path:
                    print("⚠️  Please provide a valid file path")
                    continue
                
                # Remove quotes if present
                pdf_path = pdf_path.strip('"\'')
                
                # Process the PDF
                results = self.process_pdf(pdf_path)
                
                if results:
                    print(f"✅ Successfully processed: {Path(pdf_path).name}")
                    print(f"   - Pages: {results['metadata']['total_pages']}")
                    print(f"   - Sections: {results['metadata']['total_sections']}")
                    print(f"   - Tables: {results['metadata']['total_tables']}")
                    print(f"   - Text Chunks: {len(results['text_content'])}")
                else:
                    print("❌ Failed to process PDF")
                
                print("\n" + "-" * 50 + "\n")
                
            except KeyboardInterrupt:
                print("\n👋 Goodbye!")
                break
            except Exception as e:
                print(f"❌ Unexpected error: {e}")

def main():
    """Main function to run the PDF processor"""
    processor = PDFProcessor()
    processor.interactive_mode()

if __name__ == "__main__":
    main()

✓ Output directories created at: c:\Users\user\Desktop\smart doc chattbot\smart-doc-chatbot\output
🚀 LLMSherpa PDF Processor
Connected to LLMSherpa at: http://localhost:5010/api/parseDocument?renderFormat=all

✓ Valid PDF file: C:\Users\user\Desktop\smart doc chattbot\smart-doc-chatbot\AB_MS365_-_Central_Services_-_SOC_1_Report_(9-30-2024)_.pdf
🔄 Processing PDF: C:\Users\user\Desktop\smart doc chattbot\smart-doc-chatbot\AB_MS365_-_Central_Services_-_SOC_1_Report_(9-30-2024)_.pdf
🔗 Using LLMSherpa at: http://localhost:5010/api/parseDocument?renderFormat=all
❌ Error processing PDF: No host specified.
❌ Failed to process PDF

--------------------------------------------------

✓ Valid PDF file: C:\Users\user\Desktop\smart doc chattbot\smart-doc-chatbot\AB_MS365_-_Central_Services_-_SOC_1_Report_(9-30-2024)_.pdf
🔄 Processing PDF: C:\Users\user\Desktop\smart doc chattbot\smart-doc-chatbot\AB_MS365_-_Central_Services_-_SOC_1_Report_(9-30-2024)_.pdf
🔗 Using LLMSherpa at: http://localhost:5010