In [78]:
import base64
import os
from io import BytesIO
from pathlib import Path
from typing import Union, Optional

import pandas as pd
import pytesseract
from anthropic import Anthropic
from fuzzywuzzy import fuzz
from pdf2image import convert_from_path
from PIL import Image

In [5]:
# Read the key from the environment so a real credential never has to be pasted
# into (and saved with) the notebook. The original placeholder is kept as the
# fallback when ANTHROPIC_API_KEY is not set.
API_KEY = os.environ.get('ANTHROPIC_API_KEY', '<API KEY>')

In [7]:
# Built with implicit string concatenation instead of backslash continuations
# inside a single literal: a backslash-newline keeps the next source line's
# leading indentation, which put long runs of spaces in the middle of the prompt.
PDF_SCRAP_PROMPT = (
    "Please extract all text from this image exactly as "
    "it appears, preserving all formatting and line breaks. "
    "Do not generate any preamble or descriptions."
)

# Module-level Anthropic API client, constructed with API_KEY.
client = Anthropic(api_key=API_KEY)

In [79]:
def read_pdf_file(file_path: Union[str, Path]) -> bytes:
    """Load a PDF from disk and hand back its contents unmodified.

    Args:
        file_path: Location of the file to read.

    Returns:
        The complete file contents as ``bytes``.
    """
    with open(file_path, mode='rb') as pdf_handle:
        raw_bytes = pdf_handle.read()
    return raw_bytes

def create_message_with_single_page(client, pdf_data, prompt, max_tokens,
                                    model="claude-3-5-sonnet-20241022"):
    """Send one base64-encoded PDF plus a text prompt in a single user message.

    Args:
        client: Object exposing ``messages.create(...)``, e.g. an ``Anthropic`` client.
        pdf_data: Base64-encoded PDF content, as a string.
        prompt: Instruction text sent alongside the document.
        max_tokens: Token limit, forwarded unchanged to ``messages.create``.
        model: Model identifier to request. Defaults to the model that was
            previously hard-coded here, so existing callers are unaffected.

    Returns:
        A ``(message, text)`` tuple: the raw response object and the
        concatenated text of every text-bearing content block in it
        (an empty string when the response has none).
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": pdf_data
                    }
                },
                {
                    "type": "text",
                    "text": prompt
                }
            ]
        }]
    )
    # Join every text block instead of indexing content[0]: indexing raises
    # IndexError on an empty content list and silently drops any later blocks.
    text = "".join(
        block.text
        for block in message.content
        if isinstance(getattr(block, "text", None), str)
    )
    return message, text

## Scrape with Claude PDF skill

In [None]:
from PyPDF2 import PdfReader, PdfWriter
import os  # Import os for directory handling
import time  # Import time for latency measurement
import base64
from io import BytesIO

# PDF to process; a relative path, so it is resolved against the working directory.
pdf_path = 'bagel_jays.pdf'
# Instruction text for the extraction requests (PDF_SCRAP_PROMPT is defined in an earlier cell).
prompt = PDF_SCRAP_PROMPT

def process_pdf_pages(client, pdf_path, prompt, max_tokens, page_number=None,
                      output_dir='ground_truth'):
    """Extract text from a PDF one page at a time and save it per page.

    Each selected page is written into its own single-page PDF in memory,
    base64-encoded, and sent together with ``prompt`` through
    ``create_message_with_single_page``. The returned text is saved to
    ``<output_dir>/claude_pg<N>.txt``, where N is the 1-based page number.

    Args:
        client: API client forwarded to ``create_message_with_single_page``.
        pdf_path: Path of the PDF to read.
        prompt: Instruction text sent with every page.
        max_tokens: Token limit forwarded with every request.
        page_number: 1-based page to process; ``None`` processes every page.
        output_dir: Directory for the per-page text files; created if missing.
            Defaults to the previously hard-coded ``'ground_truth'``.

    Returns:
        A list of ``(text, latency_seconds)`` tuples, one per processed page,
        in the order the pages were processed.

    Raises:
        ValueError: If ``page_number`` is outside ``1..total_pages``.
    """
    pdf_reader = PdfReader(pdf_path)
    total_pages = len(pdf_reader.pages)

    if page_number is None:
        pages_to_process = range(total_pages)
    else:
        if not 1 <= page_number <= total_pages:
            raise ValueError(f"Page number must be between 1 and {total_pages}")
        pages_to_process = [page_number - 1]  # Convert to 0-based index

    os.makedirs(output_dir, exist_ok=True)
    results = []

    for page_idx in pages_to_process:
        # Re-wrap the single page as a standalone PDF so only that page is sent.
        writer = PdfWriter()
        writer.add_page(pdf_reader.pages[page_idx])

        page_buffer = BytesIO()
        writer.write(page_buffer)
        pdf_data = base64.b64encode(page_buffer.getvalue()).decode()

        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments the way time.time() differences can.
        start_time = time.perf_counter()
        _message, message_text = create_message_with_single_page(client, pdf_data, prompt, max_tokens)
        latency = time.perf_counter() - start_time

        results.append((message_text, latency))

        output_path = os.path.join(output_dir, f'claude_pg{page_idx + 1}.txt')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(message_text)  # Store the extracted text

    return results


In [66]:
# NOTE(review): `results` is not assigned in any visible cell; presumably it is
# the list of (text, latency) tuples returned by process_pdf_pages(...).
# Confirm and add that call before relying on Restart & Run All.
# open(..., 'a') creates the file but not its parent directory, so create it first.
os.makedirs('results', exist_ok=True)
with open('results/latency.txt', 'a') as file:
    # Number from 1 so the labels line up with the 1-based claude_pg<N>.txt files.
    for page_no, (_text, latency) in enumerate(results, start=1):
        file.write(f"Pg {page_no}: {latency}\n")

## Scrape with Tesseract

In [71]:
def extract_text_tesseract_all_pages(pdf_path, output_dir='tesseract_output'):
    """Run Tesseract OCR over every page of a PDF and save the text per page.

    The PDF is converted to page images with ``convert_from_path``; each image
    is passed to ``pytesseract.image_to_string`` and the result is written to
    ``<output_dir>/tesseract_pg<N>.txt``, where N is the 1-based page number.

    Args:
        pdf_path: Path of the PDF to OCR.
        output_dir: Directory for the per-page text files; created if missing.
            Defaults to ``'tesseract_output'``, the same default that
            ``compare_ground_truth_stored`` uses for ``ocr_output_dir``.

    Returns:
        None. The extracted text is only written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)
    images = convert_from_path(pdf_path)
    for page_number, image in enumerate(images, start=1):
        if not image:
            # NOTE(review): convert_from_path presumably always yields image
            # objects, so this branch may never run; kept as a safeguard.
            print(f"Failed to convert PDF page {page_number} to image.")
            continue
        text = pytesseract.image_to_string(image)
        output_path = os.path.join(output_dir, f'tesseract_pg{page_number}.txt')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)

In [72]:
# Same file name as the pdf_path assigned in the Claude section.
pdf_path = 'bagel_jays.pdf'
# Writes one tesseract_pg<N>.txt file per page; nothing is returned.
extract_text_tesseract_all_pages(pdf_path)

## Compare outputs: Claude vs. Tesseract

In [73]:
def compare_ground_truth_stored(pdf_path, start_page=1, end_page=None,
                                ocr_output_dir='tesseract_output',
                                ground_truth_dir='ground_truth'):
    """Score stored OCR text against stored Claude text, page by page.

    For every page in the range, reads ``<ground_truth_dir>/claude_pg<N>.txt``
    (Claude's output, treated as the reference) and
    ``<ocr_output_dir>/tesseract_pg<N>.txt``, then computes ``fuzz.ratio`` and
    ``fuzz.partial_ratio`` between the two strings. No OCR or API call is made.

    Args:
        pdf_path: Not used by this function; kept so existing calls still work.
        start_page: First page number to compare (1-based file numbering).
        end_page: Last page number to compare, inclusive. ``None`` compares
            only ``start_page``.
        ocr_output_dir: Directory holding the ``tesseract_pg<N>.txt`` files.
        ground_truth_dir: Directory holding the ``claude_pg<N>.txt`` files.
            Defaults to the previously hard-coded ``'ground_truth'``.

    Returns:
        A DataFrame with one row per page and the columns ``Page Number``,
        ``Overall Similarity (ratio)`` and ``Partial String Similarity``.
        It is empty when the page range is empty.

    Raises:
        FileNotFoundError: If either text file for a page in the range is missing.
    """
    # Compare against None explicitly: a plain truthiness test would treat
    # end_page=0 as "not given" and silently compare start_page instead.
    last_page = start_page if end_page is None else end_page
    ground_truth_root = Path(ground_truth_dir)
    ocr_root = Path(ocr_output_dir)

    results = []
    for page_number in range(start_page, last_page + 1):
        ground_truth = (ground_truth_root / f'claude_pg{page_number}.txt').read_text(encoding='utf-8')
        tesseract_output = (ocr_root / f'tesseract_pg{page_number}.txt').read_text(encoding='utf-8')

        results.append({
            "Page Number": page_number,
            "Overall Similarity (ratio)": fuzz.ratio(ground_truth, tesseract_output),
            "Partial String Similarity": fuzz.partial_ratio(ground_truth, tesseract_output),
        })

    return pd.DataFrame(results)

In [80]:
# Score pages 1 through 46 (start_page defaults to 1). 46 is presumably the
# page count of pdf_path — TODO confirm.
res = compare_ground_truth_stored(pdf_path, end_page=46)
res

Unnamed: 0,Page Number,Overall Similarity (ratio),Partial String Similarity
0,1,87,91
1,2,88,99
2,3,95,95
3,4,96,99
4,5,98,100
5,6,98,98
6,7,89,91
7,8,98,99
8,9,99,100
9,10,59,59


In [81]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of res.
res.describe()

Unnamed: 0,Page Number,Overall Similarity (ratio),Partial String Similarity
count,46.0,46.0,46.0
mean,23.5,82.565217,84.217391
std,13.422618,22.295642,24.620599
min,1.0,19.0,13.0
25%,12.25,79.5,85.25
50%,23.5,91.5,96.0
75%,34.75,98.0,99.0
max,46.0,100.0,100.0


## Analysis & Notes

- First, and most pressing: I do not yet understand how the similarity scores translate into actual OCR accuracy.
- Second, our evaluation hinges on Claude's output being a good representation of the ground truth.
- How do I effectively present the accuracy statistics in a way that supports both the pros and the cons of a given OCR?

Next steps
- Understand the two metrics being used, Overall Similarity (ratio) and Partial String Similarity, and work through an example of a single page.
    - Curious to see how the newline characters play into accuracy, as well as how they are weighted.