# Energy Bill OCR Pipeline

This notebook processes 1,000 energy bill PDFs using VisionAgent and the LandingAI Agentic Document Extraction API to extract structured JSON data.

## Overview
- **Input**: PDF files in `data/raw_data/`
- **Output**: JSON files in `output_json/`
- **APIs Used**: VisionAgent, LandingAI, Anthropic (fallback), Google Vision (OCR)
- **Processing**: Batch processing with error handling and resume capability

In [2]:
#!git clone --branch amy-workspace --single-branch <GITHUB TOKEN>/Automated-Utility-Bill-and-Contract-Processing-using-Agentic-RAG

fatal: destination path 'Automated-Utility-Bill-and-Contract-Processing-using-Agentic-RAG' already exists and is not an empty directory.


In [3]:
# Enter the cloned repository and pull the latest commits from the amy-workspace branch.
%cd /content/Automated-Utility-Bill-and-Contract-Processing-using-Agentic-RAG
!git pull origin amy-workspace

/content/Automated-Utility-Bill-and-Contract-Processing-using-Agentic-RAG
From https://github.com/UChicago-Capstone-Summer2025-Powerkiosk/Automated-Utility-Bill-and-Contract-Processing-using-Agentic-RAG
 * branch            amy-workspace -> FETCH_HEAD
Already up to date.


## 1. Setup and Configuration

In [4]:
import os

# Change the current working directory to the repository root so that the local
# modules (config, utils, api_clients) and relative paths such as prompt.txt resolve.
# Defaults to the Colab checkout location; set PIPELINE_PROJECT_ROOT to run the
# notebook from a checkout that lives somewhere else.
project_root = os.environ.get(
    "PIPELINE_PROJECT_ROOT",
    "/content/Automated-Utility-Bill-and-Contract-Processing-using-Agentic-RAG",
)
os.chdir(project_root)

In [5]:
# Install required packages
!pip install -r requirements.txt



In [6]:
# Import required modules
import sys
import json
import getpass
import pdf2image
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pandas as pd

# Import custom modules (make sure these files are in your working directory or PYTHONPATH)
from config import PipelineConfig, LOG_CONFIG
from utils import DocumentProcessor, DataValidator, FileManager, ProgressTracker
from api_clients import HybridExtractionClient

# Setup logging
from loguru import logger

# Re-running this cell must not stack a second file sink (every record would then
# be written to logs/pipeline.log twice), so drop the sink registered by a previous
# run of this cell before adding a fresh one.
try:
    logger.remove(_pipeline_log_sink_id)
except (NameError, ValueError):
    # NameError: first run in this kernel. ValueError: the sink was already removed.
    pass
_pipeline_log_sink_id = logger.add("logs/pipeline.log", **LOG_CONFIG)

print("All modules imported successfully")

ImportError: cannot import name 'VisionAgent' from 'vision_agent.agent' (/usr/local/lib/python3.11/dist-packages/vision_agent/agent/__init__.py)

## 2. API Key Configuration

In [21]:
# Collect API keys securely
print("Please enter your API keys:")
print("(Keys will not be displayed or stored in the notebook)\n")

# Strip surrounding whitespace: a pasted key that carries a stray space or newline
# would otherwise be passed on unchanged, and a whitespace-only entry would count
# as "provided".
vision_agent_key = getpass.getpass("VisionAgent API Key: ").strip()
landing_ai_key = getpass.getpass("LandingAI API Key: ").strip()
anthropic_key = getpass.getpass("Anthropic API Key: ").strip()
google_key = getpass.getpass("Google API Key: ").strip()

# Validate keys are provided, and say which ones are missing
entered_keys = {
    "VisionAgent": vision_agent_key,
    "LandingAI": landing_ai_key,
    "Anthropic": anthropic_key,
    "Google": google_key,
}
missing_keys = [label for label, value in entered_keys.items() if not value]
if missing_keys:
    raise ValueError(f"All API keys are required (missing: {', '.join(missing_keys)})")

print("API keys configured successfully")

Please enter your API keys:
(Keys will not be displayed or stored in the notebook)

VisionAgent API Key: ··········
LandingAI API Key: ··········
Anthropic API Key: ··········
Google API Key: ··········
API keys configured successfully


## 3. Initialize Pipeline Components

In [28]:
# Build the pipeline configuration and let it create its directories
config = PipelineConfig()
config.create_directories()

# Helper objects used by the processing functions further down
doc_processor = DocumentProcessor(dpi=config.PDF_DPI)
validator = DataValidator()
file_manager = FileManager(config.INPUT_DIR, config.OUTPUT_DIR)
progress_tracker = ProgressTracker("logs/processing_progress.json")

# Extraction client, constructed from the four API keys collected earlier
api_credentials = {
    "vision_agent_key": vision_agent_key,
    "landing_ai_key": landing_ai_key,
    "anthropic_key": anthropic_key,
    "google_key": google_key,
}
api_client = HybridExtractionClient(**api_credentials)

# Report the directories the run will use
print("Pipeline components initialized")
print(f"Input directory: {config.INPUT_DIR}")
print(f"Output directory: {config.OUTPUT_DIR}")
print(f"Logs directory: {config.LOG_DIR}")

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

## 4. Load Prompts and Validate Setup

In [None]:
# Load prompts from prompt.txt
def _parse_prompts(prompt_text: str) -> dict:
    """
    Parse prompt definitions written as: name = <triple-quoted text>.

    The text is scanned for each opening marker and read up to the next closing
    triple quote, so a prompt may contain blank lines and consecutive prompts do
    not need to be separated by a blank line.

    Args:
        prompt_text: Full contents of the prompt file.

    Returns:
        Dict mapping each prompt name to its text, stripped of surrounding whitespace.
    """
    opener = ' = """'
    closer = '"""'
    parsed = {}
    remainder = prompt_text

    while opener in remainder:
        head, _, tail = remainder.partition(opener)
        # The prompt name is whatever sits on the line directly before the opener.
        head_lines = head.splitlines()
        key = head_lines[-1].strip() if head_lines else ''
        # An unterminated prompt runs to the end of the file.
        value, _, remainder = tail.partition(closer)
        if key:
            parsed[key] = value.strip()

    return parsed


try:
    with open('prompt.txt', 'r', encoding='utf-8') as f:
        prompt_content = f.read()
except FileNotFoundError as exc:
    # Raise instead of calling sys.exit(): in a notebook this stops the run with a
    # clear error rather than a SystemExit warning.
    raise FileNotFoundError(
        "prompt.txt not found. Please ensure it's in the current directory."
    ) from exc

prompts = _parse_prompts(prompt_content)

print("Prompts loaded successfully")
print(f"Available prompts: {list(prompts.keys())}")

## 5. File Discovery and Batch Planning

In [None]:
# Find every PDF known to the file manager
pdf_files = file_manager.get_pdf_files()
print(f"Found {len(pdf_files)} PDF files")

# Ask the progress tracker which of those files are still unprocessed
unprocessed_files = progress_tracker.get_unprocessed_files(pdf_files)
print(f"⏳ Unprocessed files: {len(unprocessed_files)}")

if len(unprocessed_files) > 0:
    # Record run metadata before any work starts
    progress_tracker.progress_data["total_files"] = len(pdf_files)
    progress_tracker.progress_data["start_time"] = datetime.now().isoformat()
    progress_tracker.save_progress()

    # Split the remaining work into batches of config.BATCH_SIZE
    batches = file_manager.create_batches(unprocessed_files, config.BATCH_SIZE)
    print(f"Created {len(batches)} batches of {config.BATCH_SIZE} files each")

    # Preview the first few files that will be processed
    print("\nSample files to process:")
    for position, file_path in enumerate(unprocessed_files[:5], start=1):
        print(f"  {position}. {file_path.name}")

    remaining_count = len(unprocessed_files) - 5
    if remaining_count > 0:
        print(f"  ... and {remaining_count} more files")
else:
    print("All files have been processed!")

## 6. Define Processing Functions

In [None]:
def process_single_pdf(pdf_path: Path) -> dict:
    """
    Process a single PDF file through the complete pipeline.

    The PDF is converted to page images, each page is sent to the hybrid
    extraction client, and the page result with the highest confidence score is
    kept. That result is cleaned, validated against the schema, and saved as JSON.
    Temporary page images are removed whether or not processing succeeds.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A dict with the keys ``filename``, ``success``, ``data``, ``error``,
        ``processing_time`` (seconds), ``confidence_score``, ``output_path`` and
        ``validation_errors``. Processing failures are reported through
        ``success``/``error`` instead of being raised.
    """
    # Every key is present from the start so success and failure records share a shape.
    result = {
        'filename': pdf_path.name,
        'success': False,
        'data': None,
        'error': None,
        'processing_time': 0,
        'confidence_score': 0.0,
        'output_path': None,
        'validation_errors': None
    }

    start_time = datetime.now()

    # Defined before the try block so the cleanup in `finally` can always refer to them.
    temp_dir = None
    image_paths = []

    try:
        logger.info(f"Processing: {pdf_path.name}")

        # Step 1: Convert PDF to images
        temp_dir = Path(config.TEMP_DIR) / pdf_path.stem
        temp_dir.mkdir(parents=True, exist_ok=True)

        image_paths = doc_processor.pdf_to_images(str(pdf_path), str(temp_dir))

        if not image_paths:
            raise Exception("Failed to convert PDF to images")

        # Step 2: Process each page and keep the most confident result
        combined_data = None
        best_confidence = 0.0

        for image_path in image_paths:
            try:
                # Extract data using hybrid approach
                extraction_result = api_client.extract_comprehensive_data(image_path)

                if extraction_result['combined_data']:
                    confidence = extraction_result['confidence_score']

                    # Accept the first page that yields data even at confidence 0.0;
                    # otherwise extracted data would be discarded and the file
                    # reported as "No data extracted".
                    if combined_data is None or confidence > best_confidence:
                        combined_data = extraction_result['combined_data']
                        best_confidence = confidence

                        logger.info(f"Page processed with confidence: {confidence:.2f}")

            except Exception as e:
                # A bad page must not abort the whole document.
                logger.warning(f"Failed to process page {image_path}: {e}")
                continue

        if not combined_data:
            raise Exception("No data extracted from any page")

        # Step 3: Validate and clean data
        # Set document ID from filename
        combined_data['documentId'] = pdf_path.stem

        # Clean and validate
        cleaned_data = validator.clean_extracted_data(combined_data)

        # Validate against schema; failures are logged but do not block saving
        is_valid, validation_errors = validator.validate_json(cleaned_data)

        if not is_valid:
            logger.warning(f"Validation errors for {pdf_path.name}: {validation_errors}")

        # Step 4: Save results
        output_path = file_manager.save_json(cleaned_data, pdf_path.stem)

        result.update({
            'success': True,
            'data': cleaned_data,
            'confidence_score': best_confidence,
            'output_path': output_path,
            'validation_errors': validation_errors if not is_valid else None
        })

        logger.info(f"Successfully processed: {pdf_path.name}")

    except Exception as e:
        error_msg = str(e)
        result['error'] = error_msg
        logger.error(f"Error processing {pdf_path.name}: {error_msg}")

    finally:
        # Cleanup temporary files on every path, including failures, so a long run
        # does not leave page images behind. Cleanup is best-effort: problems are
        # logged and never raised.
        for image_path in image_paths or []:
            try:
                os.remove(image_path)
            except Exception as cleanup_error:
                logger.warning(f"Could not remove temporary file {image_path}: {cleanup_error}")

        if temp_dir is not None:
            try:
                if temp_dir.is_dir():
                    temp_dir.rmdir()
            except Exception as cleanup_error:
                logger.warning(f"Could not remove temporary directory {temp_dir}: {cleanup_error}")

        end_time = datetime.now()
        result['processing_time'] = (end_time - start_time).total_seconds()

    return result


def process_batch(batch_files: list, batch_num: int) -> list:
    """
    Process a batch of files with parallel processing.

    Each file is submitted to a thread pool of ``config.MAX_WORKERS`` workers and
    run through ``process_single_pdf``. As tasks complete, the progress tracker is
    updated from the calling thread.

    Args:
        batch_files: Paths of the PDFs in this batch.
        batch_num: Batch number, used only in log messages.

    Returns:
        One result dict per file, in completion order. A file whose task raised
        gets a failure record, so the count always matches ``batch_files``.
    """
    logger.info(f"Starting batch {batch_num} with {len(batch_files)} files")

    batch_results = []

    with ThreadPoolExecutor(max_workers=config.MAX_WORKERS) as executor:
        # Submit all files in batch
        future_to_file = {executor.submit(process_single_pdf, file_path): file_path
                         for file_path in batch_files}

        # Process completed tasks
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            result = None
            try:
                result = future.result()
                batch_results.append(result)

                # Update progress
                if result['success']:
                    progress_tracker.mark_processed(file_path.stem)
                else:
                    progress_tracker.mark_failed(file_path.stem, result['error'])

            except Exception as e:
                logger.error(f"Batch processing error for {file_path.name}: {e}")
                progress_tracker.mark_failed(file_path.stem, str(e))

                # `result` is still None only when the task itself raised; record
                # that failure so the summary does not undercount failed files.
                if result is None:
                    batch_results.append({
                        'filename': file_path.name,
                        'success': False,
                        'data': None,
                        'error': str(e),
                        'processing_time': 0,
                        'confidence_score': 0.0
                    })

    logger.info(f"Completed batch {batch_num}")
    return batch_results


print("Processing functions defined")


## 7. Test Processing (Single File)

In [None]:
# Smoke-test the pipeline on one file before committing to the full run
if not unprocessed_files:
    print("No unprocessed files available for testing.")
else:
    print("Testing pipeline with single file...")
    test_file = unprocessed_files[0]

    print(f"Testing with: {test_file.name}")

    # Run the complete pipeline on that one file
    test_result = process_single_pdf(test_file)
    test_succeeded = test_result['success']

    print(f"\nTest Results:")
    print(f"  Success: {test_result['success']}")
    print(f"  Processing time: {test_result['processing_time']:.2f} seconds")
    print(f"  Confidence score: {test_result['confidence_score']:.2f}")

    if not test_succeeded:
        print(f"  Error: {test_result['error']}")
    else:
        print(f"  Output saved to: {test_result['output_path']}")

        # Show a few headline fields from the extracted record
        sample_data = test_result['data']
        print(f"\nSample extracted data:")
        print(f"  Document ID: {sample_data.get('documentId')}")
        print(f"  Issuer: {sample_data.get('issuer')}")
        print(f"  Customer: {sample_data.get('customerName')}")
        print(f"  Statement Date: {sample_data.get('statementDate')}")
        print(f"  Total Usage: {sample_data.get('totalUsage')}")
        print(f"  Locations: {len(sample_data.get('locations', []))} found")

        # Only a successful test prompts for confirmation; anything but "y" stops here
        continue_processing = input("\nTest successful! Continue with full processing? (y/n): ")
        if continue_processing.lower() != 'y':
            print("Exiting pipeline.")
            sys.exit(0)

## 8. Run Full Batch Processing

In [None]:
# Defined unconditionally so the summary cell below still works when there is
# nothing left to process.
all_results = []

if unprocessed_files:
    total_batches = len(batches)

    for batch_num, batch_files in enumerate(batches, start=1):
        # "\n" (not "/n") so each batch header starts on its own line
        print(f"\nProcessing batch {batch_num}/{total_batches} ...")
        batch_results = process_batch(batch_files, batch_num)
        all_results.extend(batch_results)

        # Save intermediate progress after each batch so an interrupted run can resume
        progress_tracker.save_progress()

    print("\nAll batches processed.")
else:
    print("No files to process.")

## 9. Summary and Logs

In [None]:
# Split the collected results into failures and successes
failed_results = [entry for entry in all_results if not entry['success']]
fail_count = len(failed_results)
success_count = len(all_results) - fail_count

print(f"\nProcessing Summary:")
print(f"  Total files processed: {len(all_results)}")
print(f"  Successful: {success_count}")
print(f"  Failed: {fail_count}")

# List each failed file together with the error recorded for it
if failed_results:
    print("\nFailed files and errors:")
    for entry in failed_results:
        print(f" - {entry['filename']}: {entry['error']}")