# Initial Attempt to Apply Tesseract on Example Image

In [1]:
import pytesseract
from pdf2image import convert_from_path
import os
import cv2
import numpy as np
from PIL import Image

# Path to the PDF file
# The input is expected in a "data" folder inside the current working
# directory, so the notebook must be started from the project root.
data_path = os.path.join(os.getcwd(), "data")
pdf_path = os.path.join(data_path, "Example_HistoricalDoc.pdf")


In [2]:
# Convert PDF pages to images
# `images` is iterated page by page below and each item is saved with
# .save(..., 'PNG'); no rendering options are passed, so pdf2image's
# defaults apply.
images = convert_from_path(pdf_path)

In [3]:
# Full paths of the per-page PNG files, in page order
image_paths = []

# Write every rendered page into the data folder as page_<n>.png (n starts
# at 1) and remember where each file went for the preprocessing step.
for i, image in enumerate(images):
    image_path = f'page_{i+1}.png'
    full_page_path = os.path.join(data_path, image_path)
    image_paths.append(full_page_path)
    image.save(full_page_path, 'PNG')

In [27]:
# Full paths of the preprocessed (binarized) page images, in page order
processed_path = []

# The following is the standard preprocessing for images:
# upscale -> grayscale -> Gaussian blur -> Otsu binarization.

# Scale factor for upscaling, applied to both axes (loop-invariant)
scale_factor = 2

for i, path in enumerate(image_paths):
    # cv2.imread does not raise when a file is missing or unreadable; it
    # returns None, which would otherwise only surface as an obscure error
    # inside cv2.resize. Fail early with the offending path instead.
    image = cv2.imread(path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {path}")

    # Upscale the image
    upscaled_image = cv2.resize(image, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)

    # Convert the image to grayscale (cv2.imread loads colour images as BGR)
    gray_image = cv2.cvtColor(upscaled_image, cv2.COLOR_BGR2GRAY)

    # Apply Gaussian blur for denoising
    blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

    # Apply Otsu's thresholding for binarization; with THRESH_OTSU the
    # threshold is chosen from the image, so the 0 passed here is a placeholder
    _, thresholded_image = cv2.threshold(blurred_image, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Save the processed image. cv2.imwrite signals failure through its
    # return value, so check it and only record paths that were written.
    processed_file = os.path.join(data_path, f"processed_image_{i+1}.png")
    if not cv2.imwrite(processed_file, thresholded_image):
        raise OSError(f"Could not write image: {processed_file}")
    processed_path.append(processed_file)

In [26]:
###OPTIONAL###

# Optional step: cut a fixed region of interest out of every preprocessed
# page and enlarge it. The resulting files are not used in the rest of the code.
croped_path = []

for i, path in enumerate(processed_path):
    image = cv2.imread(path)

    # Region of interest: top-left corner (x, y) and its size, in pixels
    x, y = 250, 475
    width, height = 3050, 1250  # Example width and height

    # Array slicing takes rows (y) first, then columns (x)
    cropped_image = image[y:y + height, x:x + width]

    # Enlarge the crop by a factor of 4 along both axes
    upscaled_image = cv2.resize(cropped_image, None, fx=4, fy=4, interpolation=cv2.INTER_LINEAR)

    # Record the destination, then write the enlarged crop there
    crop_file = os.path.join(data_path, f'crop_image_{i+1}.png')
    croped_path.append(crop_file)
    cv2.imwrite(crop_file, upscaled_image)

In [29]:
# Use the standard Tesseract Model

# Full paths of the raw OCR text files, in page order
out_path = []

# OCR results go to an "output" folder next to the data folder. Create it
# up front so that open(..., 'w') below cannot fail on a missing directory.
output_dir = os.path.join(data_path, "..", "output")
os.makedirs(output_dir, exist_ok=True)

for i, path in enumerate(processed_path):
    output_file = f'output_{i+1}.txt'

    # Load image; the context manager releases the file handle after OCR
    with Image.open(path) as image:
        # Extract text. "--psm 6" selects Tesseract page segmentation mode 6
        # (assume a single uniform block of text).
        text = pytesseract.image_to_string(image, config='--psm 6')

    out_path.append(os.path.join(output_dir, output_file))

    # NOTE(review): the file is written with the platform default encoding,
    # matching how the later cell reads it back.
    with open(os.path.join(output_dir, output_file), 'w') as f:
        f.write(text)

In [34]:
# There is a lot of noise in the extracted numbers, we will attempt to format here.

def format_numbers(text):
    """Clean up rows of OCR-extracted numbers.

    The text is processed line by line (split on newline characters):

    * lines with fewer than 6 whitespace-separated tokens, including blank
      lines, are dropped;
    * if the first token of a kept line is shorter than 3 characters or ends
      with a period, it is treated as noise and removed, so the second token
      becomes the first;
    * the remaining tokens are re-joined with single spaces.

    Returns the kept lines joined by newline characters.
    """
    kept_rows = []

    for raw_line in text.split('\n'):
        tokens = raw_line.split()

        # A blank line yields no tokens, so this also skips empty lines
        if len(tokens) < 6:
            continue

        leading = tokens[0]
        if len(leading) < 3 or leading.endswith("."):
            # Drop the noise token; the next token moves to the front
            tokens = tokens[1:]

        kept_rows.append(' '.join(tokens))

    return '\n'.join(kept_rows)

In [35]:
import re

# Define a function to replace non-numeric characters with space

def replace_non_numeric(match):
    """Replacement callback for re.sub: keep numeric matches, blank the rest.

    The matched text is returned unchanged when it consists only of digits
    once every period is removed; otherwise a single space is returned.
    """
    token = match.group()
    if token.replace('.', '').isdigit():
        return token
    return ' '

# Cleaned text of every OCR output file, in page order
formatted = []

# Strip everything that is not a number from the raw OCR text.
# First pattern: matches either a plain number (integer or decimal) or a run
# of characters that are not whitespace, digits or periods;
# replace_non_numeric keeps the former and turns the latter into a space.
token_pattern = r'\b(?:\d+\.\d+|\d+)\b|[^\s\d.]+'
# Second pattern: a period that has a digit on neither side
stray_period_pattern = r'(?<![0-9])\.(?![0-9])'

for i, path in enumerate(out_path):
    # Read the text from the file
    with open(path, 'r') as file:
        text = file.read()

    # Replace non-numeric tokens with a space
    text_with_spaces = re.sub(token_pattern, replace_non_numeric, text)

    # Replace stray periods with a space
    text_without_periods = re.sub(stray_period_pattern, ' ', text_with_spaces)

    formatted.append(text_without_periods)


# Format the output in a more suitable way
# Each cleaned page text is passed through format_numbers and written to
# formatted_output_<n>.txt in the output folder; enumerate supplies the
# 1-based file number instead of a hand-maintained counter.
for count, t in enumerate(formatted, start=1):
    output_file = f'formatted_output_{count}.txt'
    text = format_numbers(t)
    with open(os.path.join(data_path, "..", "output", output_file), 'w') as f:
        f.write(text)
