In [None]:
import pytesseract
import json
import os
from PIL import Image, ImageDraw, ImageColor, ImageFont
from typing import Tuple
from IPython.display import display


# Every path used in this notebook is relative to the repository root, so
# report the current working directory and remind the reader to correct it.
print(
    "The working directory is: ",
    os.getcwd(),
)
print(
    "Please adjust it to the root of the repository so that all the path links can work properly"
)
# To change the working directory, uncomment the call below and point it at
# your local checkout of the repository:
#os.chdir("/Users/t.adeoti/codes-and-scripts/projects/newspaper-ocr")

# Goal: Improve OCR performance

- Image path: `data/jpg/december_1994`
- Layout path: `layout/december_1994`

You can explore:
1. Using OCR libraries other than pytesseract: the OCR is currently performed by the line `pytesseract.image_to_string(cropped_image)`
2. Improving the image quality before OCR: one has to research which image transformations improve OCR performance

## Display Image with Layout

In [None]:
def load_json_file(file_path):
    """
    Read a JSON document from disk and return the decoded Python object.

    Parameters:
    - file_path (str or path-like): Location of the JSON file.

    Returns:
    The object produced by `json.load` for the file's content.

    Raises:
    - OSError: If the file cannot be opened.
    - json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON files are UTF-8 encoded; without an explicit encoding `open` uses
    # the platform's locale encoding, which can mis-decode or reject
    # non-ASCII text on some systems.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def draw_layout_rectangles(image, layout):
    """
    Outline every layout block on an image and label it with its type and reading order.

    The drawing happens in place: the image passed in is modified and the
    same object is returned. Pass `image.copy()` when the untouched pixels
    are still needed afterwards (for example for OCR).

    Parameters:
    - image (PIL.Image.Image): The image to draw on.
    - layout (dict): Layout information; `layout["layout"]` is iterated and
      every entry must provide 'coordinates', 'layout_type', 'block_color'
      and 'reading_order'.

    Returns:
    The same image object, now carrying the rectangles and labels.
    """
    draw = ImageDraw.Draw(image)

    for block in layout["layout"]:
        # assumes coordinates is a flat [x0, y0, x1, y1] sequence -- TODO confirm
        coordinates = block['coordinates']
        block_type = block['layout_type']
        block_color = block['block_color']
        reading_order = block['reading_order']

        # Convert block color to RGB if it's a string color representation
        if isinstance(block_color, str):
            block_color = ImageColor.getrgb(block_color)

        # Draw rectangle
        draw.rectangle(coordinates, outline=tuple(block_color), width=3)

        # Put the label 20 px above the block's top-left corner. The vertical
        # position is clamped to 0 so that labels of blocks lying within
        # 20 px of the top edge are not drawn outside the image.
        label_position = (coordinates[0], max(coordinates[1] - 20, 0))
        draw.text(label_position, f"{block_type}_{reading_order}", fill="red")

    return image


def get_image_and_layout(desired_image, image_dir="data/jpg/december_1994", layout_dir="layout/december_1994"):
    """
    Open an image and load the layout JSON that belongs to it.

    The layout file is expected to be named `<image stem>_layout.json`.

    Parameters:
    - desired_image (str): File name of the image, e.g. "page.jpeg".
    - image_dir (str): Directory containing the image. Defaults to the
      December 1994 image folder.
    - layout_dir (str): Directory containing the layout files. Defaults to
      the December 1994 layout folder.

    Returns:
    tuple: The opened image (as returned by `Image.open`) and the decoded layout.
    """
    image_path = f"{image_dir}/{desired_image}"

    # Build the layout name from the file stem. Replacing the literal
    # '.jpeg' left names with any other extension unchanged (yielding a path
    # that is not a layout file) and rewrote every occurrence of '.jpeg',
    # not just the extension.
    image_stem, _extension = os.path.splitext(desired_image)
    layout_path = f"{layout_dir}/{image_stem}_layout.json"

    image = Image.open(image_path)
    layout = load_json_file(layout_path)

    return image, layout

## 1. Choose Image from the list of all Images

In [None]:
# Sort the listing so that an index always refers to the same file
# (os.listdir returns entries in arbitrary order) and keep only the .jpeg
# pages so that stray files such as .DS_Store cannot be selected.
all_imgs_names = sorted(
    name for name in os.listdir("data/jpg/december_1994") if name.endswith(".jpeg")
)
# The highest valid index is len - 1; the prompt used to advertise len itself.
desired_image_index = int(input(f"Image Selection: Choose a number between 0 and {len(all_imgs_names) - 1}"))
if not 0 <= desired_image_index < len(all_imgs_names):
    raise IndexError(
        f"Image index {desired_image_index} is out of range; expected a number between 0 and {len(all_imgs_names) - 1}"
    )
desired_image = all_imgs_names[desired_image_index]
image, layout = get_image_and_layout(desired_image)
# Annotate a copy: `image` is cropped and passed to OCR further down, and the
# drawn outlines and labels would otherwise end up inside the OCR input.
draw_layout_rectangles(image.copy(), layout)

# Perform OCR

In [None]:
from PIL import Image
import pytesseract
from IPython.display import display
from typing import Dict, Union

def perform_ocr(
    img: Image.Image,
    layout: Dict[str, Union[str, list]],
    block_number: int,
    lang: Union[str, None] = None,
    config: str = "",
) -> Dict[str, Union[str, int]]:
    """
    Perform Optical Character Recognition (OCR) on one layout block of an image.

    The block's bounding box is cropped out of the image, the crop is
    displayed in the notebook, and the crop is passed to pytesseract.

    Parameters:
    - img (PIL.Image.Image): The input image.
    - layout (dict): Layout information for the document; the blocks are read
      from `layout["layout"]`.
    - block_number (int): The index of the block to process.
    - lang (str or None): Language code(s) forwarded to pytesseract; None
      keeps pytesseract's default.
    - config (str): Extra configuration flags forwarded to pytesseract;
      the empty string keeps pytesseract's default.

    Returns:
    dict: Dictionary with the keys "block_type", "reading_order" and "ocr_result".

    Raises:
    - IndexError: If block_number does not refer to an existing block.
    """
    layout_info = layout["layout"]
    block_count = len(layout_info)

    # Same index range a plain list lookup accepts, but with an error message
    # that tells the user how many blocks are available.
    if not -block_count <= block_number < block_count:
        raise IndexError(
            f"block_number {block_number} is out of range: the layout has {block_count} block(s)"
        )
    block = layout_info[block_number]

    coordinates = block['coordinates']
    block_type = block['layout_type']
    reading_order = block['reading_order']

    # Crop the image section within the bounding box
    cropped_image = img.crop(coordinates)

    # Display the cropped image in Jupyter Notebook
    display(cropped_image)

    # Perform OCR using pytesseract
    ocr_result = pytesseract.image_to_string(cropped_image, lang=lang, config=config)

    return {
        "block_type": block_type,
        "reading_order": reading_order,
        "ocr_result": ocr_result
    }


# 2. Choose the Block to Perform OCR on

In [None]:
# `layout_info` only exists as a local variable inside the functions above,
# so the block count is read from `layout` directly. The highest valid index
# is the block count minus one.
block_number = int(input(f"Block Selection: Choose a block number between 0 and {len(layout['layout']) - 1} for OCR")) # choose one of the blocks to perform OCR on

perform_ocr(image, layout, block_number)