In [1]:
import fitz
from docx import Document
from PIL import Image
import pytesseract
import easyocr
import os
import json
from anthropic import Anthropic

In [2]:
def extract_text_from_pdf(filepath):
    """Extract text from a PDF file, page by page.

    Each page's text is preceded by a ``--- Page N ---`` marker line, where
    N is the 1-based page number.

    Args:
        filepath (str): Path to the PDF file.

    Returns:
        str: The concatenated page text, or "" if the file could not be
        opened or read (the error is printed rather than raised).
    """
    try:
        # The context manager closes the document even if a page fails to
        # extract; an explicit close() after the loop would be skipped then.
        with fitz.open(filepath) as doc:
            parts = []
            for page_num, page in enumerate(doc, start=1):
                parts.append(f"\n--- Page {page_num} ---\n")
                parts.append(page.get_text())
            return "".join(parts)
    except Exception as e:
        print(f"Error extracting PDF: {e}")
        return ""

In [3]:
def extract_text_from_docx(filepath):
    """Pull the text out of a Word (.docx) file.

    Non-blank paragraphs come first, one per line. Every table row follows
    as one line of cell texts separated by " | ".

    Returns "" (after printing the error) when the document cannot be read.
    """
    try:
        word_doc = Document(filepath)
        # Paragraph lines first, skipping whitespace-only paragraphs.
        lines = [p.text for p in word_doc.paragraphs if p.text.strip()]
        # Then one line per table row, in table order.
        lines.extend(
            " | ".join([cell.text for cell in row.cells])
            for table in word_doc.tables
            for row in table.rows
        )
        return "".join(line + "\n" for line in lines)
    except Exception as e:
        print(f"Error extracting DOCX: {e}")
        return ""

In [4]:
def extract_text_from_image(filepath, ocr_method='easyocr', languages=None):
    """Extract text from an image using OCR.

    Args:
        filepath (str): Path to the image file.
        ocr_method (str): 'pytesseract' selects Tesseract; any other value
            falls through to EasyOCR.
        languages (list[str] | None): Language codes handed to
            ``easyocr.Reader``. Defaults to ['en']. Ignored by the
            pytesseract branch.

    Returns:
        str: The recognized text, or "" if OCR failed (the error is printed
        rather than raised).
    """
    # Resolve the default here instead of using a mutable default argument.
    if languages is None:
        languages = ['en']
    try:
        if ocr_method == 'pytesseract':
            # Close the image file handle as soon as OCR is done.
            with Image.open(filepath) as image:
                return pytesseract.image_to_string(image)

        # NOTE: a new Reader is constructed on every call.
        reader = easyocr.Reader(languages)
        results = reader.readtext(filepath)
        # Index 1 of each detection is taken as the recognized string.
        return " ".join(detection[1] for detection in results)
    except Exception as e:
        print(f"Error extracting text from image: {e}")
        return ""

In [5]:
def extract_text_from_file(filepath, ocr_method='easyocr', languages=None):
    """Extract text from any supported file type, chosen by file extension.

    Supported extensions (case-insensitive): .pdf, .docx, and the image
    formats .png, .jpg, .jpeg, .bmp, .gif, .tif, .tiff.

    Args:
        filepath (str): Path to the document.
        ocr_method (str): OCR backend name forwarded to
            ``extract_text_from_image`` for image files.
        languages (list[str] | None): OCR language codes forwarded to
            ``extract_text_from_image``. Defaults to ['en'].

    Returns:
        str: The extracted text, or "" for an unsupported extension (a
        message is printed) or when the underlying extractor returns "".
    """
    # Resolve the default here instead of using a mutable default argument.
    if languages is None:
        languages = ['en']

    file_ext = os.path.splitext(filepath)[1].lower()

    if file_ext == '.pdf':
        return extract_text_from_pdf(filepath)
    if file_ext == '.docx':
        return extract_text_from_docx(filepath)
    # '.tif' is the short spelling of '.tiff' and is routed the same way.
    if file_ext in ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff'):
        return extract_text_from_image(filepath, ocr_method, languages)

    print(f"Unsupported file type: {file_ext}")
    return ""


In [6]:
def _strip_code_fence(response_text):
    """Return response_text without a surrounding Markdown ``` fence, if any."""
    stripped = response_text.strip()
    if len(stripped) >= 6 and stripped.startswith("```") and stripped.endswith("```"):
        inner = stripped[3:-3]
        first_line, newline, remainder = inner.partition("\n")
        # The opening fence may be bare or carry a language tag such as "json".
        if newline and (not first_line.strip() or first_line.strip().isalpha()):
            inner = remainder
        return inner.strip()
    return stripped


def extract_information_ai(text, extraction_instructions=None,
                           model="claude-3-5-sonnet-20241022", max_tokens=2048):
    """
    Use Claude AI to intelligently extract relevant information from document.

    Args:
        text (str): Document text to analyze.
        extraction_instructions (str | None): What to extract. When None, a
            generic "extract all important information" instruction is used.
        model (str): Model identifier passed to the Messages API.
        max_tokens (int): Response token limit passed to the Messages API.

    Returns:
        The parsed JSON value when the response is valid JSON (optionally
        wrapped in a Markdown code fence); ``{"extracted_content": <text>}``
        when it is not; ``{"error": <message>}`` when the input is empty or
        the client/API call fails. Failures are printed, never raised.
    """
    # Nothing to analyze: skip the API call entirely.
    if not text or not str(text).strip():
        return {"error": "No text provided for extraction"}

    if extraction_instructions is None:
        extraction_instructions = """Extract all important and relevant information from this document.
        Ignore filler, boilerplate, headers, footers, and irrelevant content.
        Return comprehensive structured data with all key details."""

    prompt = f"""You are an expert information extraction specialist.

Your task: {extraction_instructions}

From the document below, extract ONLY the most important and relevant information.
Ignore any irrelevant, redundant, boilerplate, footer, or noise content.

Return the extracted information as valid JSON. Be comprehensive but concise.

DOCUMENT:
---
{text}
---

Return ONLY valid JSON format. No additional text or explanation."""

    try:
        # Built inside the try so a client configuration failure is reported
        # as an error dict like every other failure. No arguments are passed,
        # so credentials must come from the SDK's default configuration.
        client = Anthropic()
        message = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        response_text = message.content[0].text.strip()

        try:
            # Strip a ```json ... ``` wrapper before parsing, if present.
            return json.loads(_strip_code_fence(response_text))
        except json.JSONDecodeError:
            # Not valid JSON: hand back the raw response text instead.
            return {"extracted_content": response_text}

    except Exception as e:
        print(f"Error during AI extraction: {e}")
        return {"error": str(e)}


In [8]:
def extract_from_document(filepath, extraction_instructions=None, ocr_method='easyocr',
                          languages=None):
    """
    Complete pipeline: Extract text from document, then use AI to extract information.

    Args:
        filepath (str): Path to document (PDF, DOCX, or Image)
        extraction_instructions (str): What specific information to extract
        ocr_method (str): 'pytesseract' or 'easyocr' for images
        languages (list[str] | None): OCR language codes for images. When
            None, the text extractor's own default is used.

    Returns:
        dict: Extracted information, or {"error": ...} when no text could
        be extracted from the document.
    """
    print(f"📄 Extracting text from: {filepath}")
    # Forward languages only when given, so the extractor's default applies
    # otherwise.
    if languages is None:
        text = extract_text_from_file(filepath, ocr_method)
    else:
        text = extract_text_from_file(filepath, ocr_method, languages)

    if not text:
        return {"error": "Could not extract text from document"}

    print(f"✅ Text extracted ({len(text)} characters)")
    print("🤖 Running AI extraction...\n")

    return extract_information_ai(text, extraction_instructions)