In [None]:
import PyPDF2
from structml import line_heal
import os
import re
import shutil
from rich.progress import track
import tika
tika.initVM()

from tika import parser

# Scratch directory that receives the single-page PDFs.
work_dir = "./pdf_temp/"

# PDF to process.
pdf_path = "/home/rstewart/Downloads/20120001369.pdf"

# exist_ok avoids the race between checking for the directory and creating it.
os.makedirs(work_dir, exist_ok=True)

# Empty the work directory so leftovers from an earlier run are not processed again.
for filename in os.listdir(work_dir):
    file_path = os.path.join(work_dir, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
    except OSError as e:
        # Best effort: report the entry that could not be removed and carry on.
        print(f"Failed to delete {file_path}. Reason: {e}")

def split_pdf_into_pages(pdf_path):
    """Write every page of a PDF to its own single-page file in ``work_dir``.

    The output files are named ``page_<n>.pdf`` with ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF file to split.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_number, page in enumerate(pdf_reader.pages, start=1):
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(page)

            # os.path.join stays correct even if work_dir has no trailing separator.
            output_path = os.path.join(work_dir, f'page_{page_number}.pdf')
            with open(output_path, 'wb') as output_pdf:
                pdf_writer.write(output_pdf)


def extract_text_from_pdf(pdf_path):
    """Return the text Tika extracts from a PDF, trimmed and newline-normalised.

    Args:
        pdf_path: Path of the PDF file handed to ``parser.from_file``.

    Returns:
        The parsed ``content`` with leading/trailing whitespace stripped and
        every run of consecutive newlines collapsed into a single newline.
        An empty string is returned when the parsed content is falsy.
    """
    parsed_pdf = parser.from_file(pdf_path)

    # Falsy content (for example None) is treated as empty text.
    content = parsed_pdf['content'] or ""  # type: ignore

    # Trim the ends first, then replace multiple newlines with a single newline.
    return re.sub(r'\n+', '\n', content.strip())


split_pdf_into_pages(pdf_path)

# One dict per page: "page_number" (int) and "text" (the page's text split
# into a list of lines).
output_text = []

for filename in track(os.listdir(work_dir)):
    if not filename.endswith(".pdf"):
        continue

    # File names look like "page_<n>.pdf"; recover <n> so pages can be ordered.
    page_number = int(filename.split("_")[1].split(".")[0])

    # os.path.join stays correct even if work_dir has no trailing separator.
    page_path = os.path.join(work_dir, filename)

    output_text.append({
        "page_number": page_number,
        "text": extract_text_from_pdf(page_path).split("\n"),
    })

# os.listdir returns entries in arbitrary order, so restore the page order.
output_text.sort(key=lambda page: page["page_number"])

# Drop pages that have no lines or only empty lines.
output_text = [page for page in output_text if any(page["text"])]

# Number of lines at the top (and bottom) of each page that are inspected for
# repeated headers (footers).
HEADER_FOOTER_WINDOW = 6

# A line counts as repeated when more than this fraction of its characters
# equal the characters at the same positions of a line on another page.
MATCH_THRESHOLD = 0.5

_DIGITS_ONLY = re.compile(r"^\d+$")


def _lines_match(line, other_line):
    """Return True if more than MATCH_THRESHOLD of ``line`` matches ``other_line``.

    Characters are compared position by position and the ratio is taken
    relative to the length of ``line``. An empty ``line`` never matches, which
    also avoids a division by zero.
    """
    if not line:
        return False
    matching_characters = sum(
        char1 == char2 for char1, char2 in zip(line, other_line)
    )
    return matching_characters / len(line) > MATCH_THRESHOLD


def _strip_repeated_lines(pages, window, *, show_progress=False):
    """Remove repeated and digit-only window lines from every page, in place.

    A line inside a page's window is removed (together with every identical
    line on that same page) when it is made of digits only, or when it matches
    a line inside the window of any other page.

    All windows are captured before anything is removed. Comparing against
    pages that were already cleaned would leave the header on the last page
    that carries it, because its counterparts would be gone by then.

    Args:
        pages: List of page dicts whose ``"text"`` entry is a list of lines.
        window: Callable that takes a page's lines and returns the lines to
            inspect (for example the first or the last few lines).
        show_progress: When True, show a rich progress bar over the pages.
    """
    windows = [window(page["text"]) for page in pages]
    page_iter = track(pages) if show_progress else pages

    for i, page in enumerate(page_iter):
        # Window lines of every page except the current one.
        other_lines = [
            other_line
            for j, other_window in enumerate(windows)
            if j != i
            for other_line in other_window
        ]
        to_remove = {
            line
            for line in windows[i]
            if _DIGITS_ONLY.match(line)
            or any(_lines_match(line, other_line) for other_line in other_lines)
        }
        if to_remove:
            page["text"] = [line for line in page["text"] if line not in to_remove]


# Compare the first 6 lines of each page against the first 6 lines of every
# other page. If more than 50% of a line's characters match, assume the line is
# a header and remove it. Lines that are just numbers are removed as well.
_strip_repeated_lines(output_text, lambda lines: lines[:HEADER_FOOTER_WINDOW])

# Do the same for the last 6 lines of each page to remove footers.
_strip_repeated_lines(
    output_text,
    lambda lines: lines[-HEADER_FOOTER_WINDOW:],
    show_progress=True,
)
           
# Rebuild the document as one string: lines inside a page are separated by a
# single newline, pages by three newlines.
page_bodies = ["\n".join(page["text"]) for page in output_text]
output_text = "\n\n\n".join(page_bodies)

# Optional structml step that converts the text into a structured format (currently disabled):
# output_text = line_heal.parse(output_text, verbose=True)

print(output_text)

