In [14]:
import fitz

In [54]:
# PDF to Image, use OCR to detect bounding boxes of text
from pdf2image import convert_from_path
import easyocr
import numpy as np
# Convert the term sheet to page images, then OCR the first page only.
pdf_path = 'Structured_note_termsheet/cct-linked-structured-note.pdf'
images = convert_from_path(pdf_path)

# English-only EasyOCR reader.
reader = easyocr.Reader(['en'])

# One keyword per line so each detection/decoding knob is easy to tweak.
bounds = reader.readtext(
    np.array(images[0]),
    min_size=10,
    slope_ths=0.5,
    ycenter_ths=0.7,
    height_ths=1.2,
    width_ths=0.85,
    decoder='beamsearch',
    beamWidth=10,
)

In [56]:
# Persist the first converted page as output.jpg (JPEG encoding).
images[0].save('output.jpg', format='JPEG')

In [36]:
def search_text_and_get_coordinates(pdf_path, search_text):
    """
    Find `search_text` in a PDF and return the box enclosing every hit.

    Each page is searched with PyMuPDF's ``page.search_for``; the smallest
    x0/y0 and the largest x1/y1 over all returned rectangles are kept.

    :param pdf_path: path of the PDF file to open
    :param search_text: text to look for on each page
    :return: ``((min_x0, min_y0), (max_x1, max_y1))``. When nothing matches,
        the untouched sentinels ``((inf, inf), (-inf, -inf))`` are returned.

    NOTE: hits from all pages are folded into one box and the page number is
    not reported, so the result is only meaningful when the text sits on a
    single page.
    """
    min_x, min_y = float('inf'), float('inf')
    max_x, max_y = float('-inf'), float('-inf')

    # The context manager closes the document even if a page fails to load,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as document:
        for page in document:
            for x0, y0, x1, y1 in page.search_for(search_text):
                # Grow the box: top-left moves towards the smallest corner,
                # bottom-right towards the largest.
                min_x, min_y = min(min_x, x0), min(min_y, y0)
                max_x, max_y = max(max_x, x1), max(max_y, y1)

    return (min_x, min_y), (max_x, max_y)

# Document to search.
pdf_path = 'Structured_note_termsheet/cct-linked-structured-note.pdf'

# Clause whose bounding box we want, passed verbatim to the search.
search_text = 'Status of the Notes: The Notes are direct, unsecured and unsubordinated obligations of the Issuer and rank pari passu without any preference among themselves and (save for certain obligations required to be preferred by law) equally with all other unsecured obligations (other than subordinated obligations, if any) of the Issuer, from time to time outstanding.'

# Left as a bare expression so the notebook displays the returned corner pair.
search_text_and_get_coordinates(pdf_path=pdf_path, search_text=search_text)

((86.30400085449219, 280.5127868652344), (526.9910888671875, 340.22802734375))

In [42]:
# One hand-built training record: the clause text, the two corner points of
# its bounding box, and the XML we expect to be produced for it.
data_json = {
    'training_data': [
        {
            'text': 'Status of the Notes: The Notes are direct, unsecured and unsubordinated obligations of the Issuer and rank pari passu without any preference among themselves and (save for certain obligations required to be preferred by law) equally with all other unsecured obligations (other than subordinated obligations, if any) of the Issuer, from time to time outstanding.',
            'start_text_coordinates': {'x': 86.30400085449219, 'y': 280.5127868652344},
            'end_text_coordinates': {'x': 526.9910888671875, 'y': 340.22802734375},
            'expected_output': '<StatusOfNotes><Description>The Notes are direct, unsecured and unsubordinated obligations of the Issuer and rank pari passu without any preference among themselves and (save for certain obligations required to be preferred by law) equally with all other unsecured obligations (other than subordinated obligations, if any) of the Issuer, from time to time outstanding.</Description></StatusOfNotes>',
        },
    ],
}

# tabular data scaling
# key scheching
# page breaks


In [61]:
# PDF to Images
import fitz
import os

# Render every page of the PDF to a PNG file under Images/.
dpi = 300
# PyMuPDF renders at 72 dpi by default, so scale both axes by dpi/72.
zoom = dpi/72
magnify = fitz.Matrix(zoom, zoom)

folder_name = 'standard_settlement_instructions'
file_name = 'standard-settlement-instructions.pdf'
path = f"{folder_name}/{file_name}"

# splitext removes the dot together with the extension, so the output is
# "<name>_page_1.png" rather than "<name>._page_1.png".
base_name = os.path.splitext(file_name)[0]

# pix.save() needs the target directory to exist already.
output_dir = 'Images'
os.makedirs(output_dir, exist_ok=True)

# Stays 0 if the document has no pages; otherwise ends as the page count.
count = 0

doc = fitz.open(path)

# Pages are numbered from 1 in the output file names.
for count, page in enumerate(doc, start=1):
    pix = page.get_pixmap(matrix=magnify)
    pix.save(f"{output_dir}/{base_name}_page_{count}.png")

In [80]:
import os
import json
import pytesseract
from PIL import Image
from pathlib import Path
from uuid import uuid4

# Set the Tesseract OCR executable path.
# The Homebrew location is used whenever it exists (unchanged behaviour on the
# original machine). Elsewhere we fall back to whatever `tesseract` is on PATH.
# If neither is found the Homebrew path is still assigned, so pytesseract
# reports the missing binary itself.
import shutil

_HOMEBREW_TESSERACT = '/opt/homebrew/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = (
    _HOMEBREW_TESSERACT
    if os.path.exists(_HOMEBREW_TESSERACT)
    else (shutil.which('tesseract') or _HOMEBREW_TESSERACT)
)

# Maps each Tesseract hierarchy column to its output level (1 = page ... 5 = word);
# used to choose how coarse the exported bounding boxes are.
LEVELS = {
    column: depth
    for depth, column in enumerate(
        ('page_num', 'block_num', 'par_num', 'line_num', 'word_num'),
        start=1,
    )
}

def create_image_url(filepath, base_url='http://localhost:8080'):
    """
    Map a local image path to the URL Label Studio should load it from.

    Label Studio requires image URLs, so this defines the mapping from the
    filesystem to URLs. Only the file name is kept; directories are dropped.

    :param filepath: path of the image on disk
    :param base_url: prefix under which the images are served. The default
        matches the server this notebook was written against (port 8080).
        Pass e.g. 'http://localhost:8081' when serving with
        ./serve_local_files.sh <my-images-dir>, or '/data/upload' to refer to
        uploaded files.
    :return: '<base_url>/<file name>'
    """
    filename = os.path.basename(filepath)
    # Tolerate a trailing slash on the prefix so the URL never contains '//'.
    prefix = base_url.rstrip('/')
    return f'{prefix}/{filename}'

def convert_to_ls(image, tesseract_output, per_level='block_num'):
    """
    Build one Label Studio task with OCR pre-annotations from Tesseract output.

    :param image: PIL image object; its ``size`` and ``filename`` are read
    :param tesseract_output: the output from tesseract, as a dict of parallel
        columns ('level', 'page_num', ..., 'left', 'top', 'width', 'height',
        'conf', 'text')
    :param per_level: control the granularity of bboxes from tesseract (a key of LEVELS)
    :return: tasks.json ready to be imported into Label Studio with "Optical Character Recognition" template
    """
    image_width, image_height = image.size
    per_level_idx = LEVELS[per_level]

    # Tesseract restarts each counter inside its parent (par_num starts again
    # in every block, block_num in every page), so a region is identified by
    # the whole path of counters down to `per_level`, not by that counter alone.
    hierarchy = [
        name
        for name, idx in sorted(LEVELS.items(), key=lambda item: item[1])
        if idx <= per_level_idx
    ]

    def region_key(row):
        return tuple(tesseract_output[name][row] for name in hierarchy)

    # Single pass: bucket every row's text and confidence under its region,
    # instead of rescanning the whole table once per region.
    words_by_region = {}
    confidences_by_region = {}
    for row, word in enumerate(tesseract_output['text']):
        key = region_key(row)
        words_by_region.setdefault(key, []).append(word)
        # conf may be an int, a float or a numeric string depending on the
        # pytesseract version; -1 marks rows that carry no recognised word
        # and must not drag the average down.
        confidence = float(tesseract_output['conf'][row])
        if confidence >= 0:
            confidences_by_region.setdefault(key, []).append(confidence / 100.)

    results = []
    all_scores = []
    for i, level_idx in enumerate(tesseract_output['level']):
        if level_idx != per_level_idx:
            continue

        key = region_key(i)
        text = ' '.join(words_by_region.get(key, [])).strip()
        if not text:
            continue

        # Label Studio expects positions as percentages of the image size.
        bbox = {
            'x': 100 * tesseract_output['left'][i] / image_width,
            'y': 100 * tesseract_output['top'][i] / image_height,
            'width': 100 * tesseract_output['width'][i] / image_width,
            'height': 100 * tesseract_output['height'][i] / image_height,
            'rotation': 0
        }

        confidences = confidences_by_region.get(key, [])
        score = sum(confidences) / len(confidences) if confidences else 0

        # The rectangle and its transcription share one id so that they are
        # tied to the same region.
        region_id = str(uuid4())[:10]
        bbox_result = {
            'id': region_id, 'from_name': 'bbox', 'to_name': 'image', 'type': 'rectangle',
            'value': bbox}
        transcription_result = {
            'id': region_id, 'from_name': 'transcription', 'to_name': 'image', 'type': 'textarea',
            'value': dict(text=[text], **bbox), 'score': score}
        results.extend([bbox_result, transcription_result])
        all_scores.append(score)

    return {
        'data': {
            'ocr': create_image_url(image.filename)
        },
        'predictions': [{
            'result': results,
            'score': sum(all_scores) / len(all_scores) if all_scores else 0
        }]
    }

tasks = []
# OCR every PNG in Images/ and turn each one into a Label Studio task.
# sorted() makes the task order reproducible; glob order is not guaranteed.
for image_path in sorted(Path('Images').glob('*.png')):
    with Image.open(image_path.absolute()) as image:
        tesseract_output = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
        task = convert_to_ls(image, tesseract_output, per_level='block_num')
        tasks.append(task)

# Create a file to import into Label Studio.
# NOTE: `file_name` comes from the PDF-to-images cell, so that cell must run first.
# Path.stem drops the full ".pdf" suffix, so the file is "<name>.json" rather
# than "<name>..json".
json_dir = Path('JSON')
json_dir.mkdir(parents=True, exist_ok=True)  # open() does not create directories
with open(json_dir / f'{Path(file_name).stem}.json', mode='w') as json_file:
    json.dump(tasks, json_file, indent=2)