In [None]:
import cv2
import numpy as np
import os
from tqdm import tqdm
import xml.etree.ElementTree as ET
from result_helpers import * 

from ledger import Ledger
from compare_csv import compare
from colours import *

In [None]:
# Collect every scanned page ("WBM*.jpg") found anywhere below the input folder
# and wrap each one in a Ledger whose id is the file name without extension.
folder = "input_jpg_xml"

ledgers = []
for root, dirs, files in os.walk(folder):
    for file in files:
        if file.endswith(".jpg") and file.startswith("WBM"):
            # os.path.splitext removes exactly the extension. str.strip(".jpg")
            # instead strips any run of the characters '.', 'j', 'p', 'g' from
            # BOTH ends and can eat part of the name ("WBM12g.jpg" -> "WBM12").
            ledger_id = os.path.splitext(file)[0]
            ledgers.append(Ledger(os.path.join(root, file), ledger_id))

print(len(ledgers), "images found")

In [None]:
# Replace every double-page scan in `ledgers` by its two halves.
# The list is modified in place: the original entry is removed and the two
# halves are appended at the end.
double_pages = [page for page in ledgers if page.is_double_page()]
print(len(double_pages), "double pages found")
for ledger in tqdm(double_pages):
    left, right = ledger.split_into_two()
    ledgers.remove(ledger)
    ledgers.extend((left, right))

In [None]:
from matplotlib import pyplot as plt

# Cut pages horizontally wherever a tall band of rows has (almost) constant
# brightness: such a band is treated as a gap, and the page is split through
# the middle of every gap that does not reach the bottom edge.
STEP_SIZE = 400                  # height (in rows) of the window tested for flatness
FLAT_THRESHOLD = STEP_SIZE / 15  # a window is flat if its summed row-to-row change stays below this
MERGE_GAP = 10                   # flat regions closer together than this many rows are merged

new_ledgers = []
for led in tqdm(ledgers):
    # Work on a grayscale copy of the cropped page.
    im = cv2.cvtColor(led.cropped_im.copy(), cv2.COLOR_BGR2GRAY)

    # Mean brightness of every row, using only the columns between 10% and 90%
    # of the width.
    x_from = int(im.shape[1] * 0.1)
    x_to = int(im.shape[1] * 0.9)
    row_colours = im[:, x_from:x_to].mean(axis=1)

    # Slide a STEP_SIZE-row window down the page and record the flat windows.
    regions = []
    i = STEP_SIZE
    while i < len(row_colours):
        # Sum the absolute brightness change between consecutive rows of the
        # window that ends at row i; stop as soon as it exceeds the threshold.
        diff = 0
        for j in range(STEP_SIZE):
            diff += abs(row_colours[i - j] - row_colours[i - j - 1])
            if diff > FLAT_THRESHOLD:
                break

        if diff < FLAT_THRESHOLD:
            regions.append((i - STEP_SIZE, i))
            # Jump past this window so recorded windows never overlap.
            i += STEP_SIZE
        i += 1

    # Stretch regions that end within 2 * STEP_SIZE of the bottom down to the
    # bottom edge, and merge regions that are less than MERGE_GAP rows apart.
    i = 0
    while i < len(regions):
        if regions[i][1] + 2 * STEP_SIZE > len(row_colours):
            regions[i] = (regions[i][0], len(row_colours))
        if i + 1 < len(regions) and regions[i + 1][0] - regions[i][1] < MERGE_GAP:
            regions[i] = (regions[i][0], regions[i + 1][1])
            regions.pop(i + 1)
        else:
            i += 1

    # Regions touching the bottom edge produce no split; every other region
    # is split through its middle row.
    splits = [
        (region_start + region_end) // 2
        for region_start, region_end in regions
        if region_end < len(row_colours)
    ]

    # Nothing to cut: keep the page as it is.
    if not splits:
        new_ledgers.append(led)

    # Cut from the bottom up so the split rows stay valid for the remaining
    # upper part. NOTE(review): the two string arguments look like id suffixes
    # for the upper and lower part - confirm against Ledger.horizontal_split.
    for i, split in enumerate(splits[::-1]):
        if i + 1 == len(splits):
            # Final (top-most) cut: both parts are kept.
            top, bottom = led.horizontal_split(split, f"-{len(splits)-1-i}", f"-{len(splits)-i}")
            new_ledgers.append(top)
            new_ledgers.append(bottom)
        else:
            # Keep the lower part and continue cutting the upper part.
            top, bottom = led.horizontal_split(split, "", f"-{len(splits)-i}")
            led = top
            new_ledgers.append(bottom)

print(len(new_ledgers), "images found. Was ", len(ledgers))

In [None]:
# From here on, work with the list built in `new_ledgers`.
ledgers = new_ledgers

In [None]:
# Turn each ledger into two by splitting it along the middle line.
# cv2.imwrite does not create missing directories, so make sure the folders
# used for the debug images exist before writing into them.
os.makedirs("not_split", exist_ok=True)
os.makedirs("2", exist_ok=True)

new_ledgers = []
for led in ledgers:
    # led.remove_borders()
    led.find_vertical_lines()

    # Pages with fewer than 10 vertical lines are skipped; they are NOT added
    # to new_ledgers and therefore leave the pipeline here.
    if len(led.vert_lines) < 10:
        continue

    try:
        led.find_middle_line()
    except Exception as exc:
        # Catch Exception instead of using a bare `except:` so that a keyboard
        # or kernel interrupt still stops the loop. The page is saved for
        # manual inspection and skipped.
        print("Cannot find middle line for", led.id, f"({exc!r})")
        cv2.imwrite(f"not_split/{led.id}.jpg", led.cropped_im)
        continue

    ml = led.middle_line
    ml_i = led.middle_line_index
    left_lines = led.vert_lines[:ml_i]
    right_lines = led.vert_lines[ml_i:]

    # The two x values of the middle line may differ; cut so that both halves
    # contain the full extent of the line (the halves overlap by that width).
    end_left = max(ml[0], ml[1])
    start_right = min(ml[0], ml[1])

    left = led.cropped_im[:, :end_left]
    right = led.cropped_im[:, start_right:]

    print("left", left.shape, "right", right.shape, ml_i, len(led.vert_lines), led.id)

    # Left half: same crop offsets as the parent page.
    left_led = Ledger.from_image(left, ledger_id=f"{led.id}-l")
    left_led.cropped_left = led.cropped_left
    left_led.cropped_top = led.cropped_top
    left_led._vert_lines = left_lines
    new_ledgers.append(left_led)

    # Right half: shifted to the right by the cut position.
    # NOTE(review): right_lines are assigned without subtracting start_right,
    # i.e. still relative to the parent image - confirm this is intended.
    right_led = Ledger.from_image(right, ledger_id=f"{led.id}-r")
    right_led.cropped_left = start_right + led.cropped_left
    right_led.cropped_top = led.cropped_top
    right_led._vert_lines = right_lines
    new_ledgers.append(right_led)

    # Debug image: lines assigned to the right half in (0, 0, 255), lines
    # assigned to the left half in (0, 255, 0).
    im = np.copy(led.cropped_im)
    for line in right_lines:
        cv2.line(im, (line.top, 0), (line.bottom, im.shape[0]), (0, 0, 255), 10)
    for line in left_lines:
        cv2.line(im, (line.top, 0), (line.bottom, im.shape[0]), (0, 255, 0), 10)
    cv2.imwrite(f"2/{led.id}.jpg", im)

In [None]:
# From here on, work with the list built in `new_ledgers`.
ledgers = new_ledgers

In [None]:
def precalc_ledger(ledger):
    """Detect the vertical lines of ``ledger`` and save an annotated copy of it.

    Calls ``find_vertical_lines`` and ``check_vertical_lines`` on the ledger,
    draws every entry of ``ledger.vert_lines`` in RED onto a copy of
    ``ledger.cropped_im`` and writes that copy to ``_tmp_loghi/<id>.jpg``.

    Returns the same ``ledger`` object that was passed in.
    """
    # The value is discarded on purpose; only the attribute access matters.
    # NOTE(review): presumably this triggers a lazily computed property - confirm.
    ledger.contrast_im
    ledger.find_vertical_lines()
    ledger.check_vertical_lines()

    preview = ledger.cropped_im.copy()
    height = preview.shape[0]
    for vert_line in ledger.vert_lines:
        x_top, x_bottom = vert_line
        # From the line's x position at the top edge to its x position at the bottom edge.
        cv2.line(preview, (x_top, 0), (x_bottom, height), RED, 5)

    cv2.imwrite(f"_tmp_loghi/{ledger.id}.jpg", preview)
    return ledger


# Run precalc_ledger on every ledger. A failure on one ledger is reported and
# does not stop the remaining ones.
for led in tqdm(ledgers):
    try:
        led = precalc_ledger(led)
    except Exception as err:
        print(f"Error in {led.id}\n{err}")


In [None]:
# Ledger ids collected under the name `not_found`.
# NOTE(review): the list is not applied anywhere in this notebook. To exclude
# these pages, enable the following line:
# ledgers = [led for led in ledgers if led.id not in not_found]
not_found = [
    "WBMB00038000380-r",
    "WBMB00048000470-r",
    "WBMB00048000440-r",
    "WBMB00018000040-r",  # a stray leading space here would never match a ledger id
    "WBMB00028000370-r",
]

In [None]:
# Ensure Loghi has run: every image in '_tmp_loghi' must have an XML file with
# the same base name in '_tmp_loghi/page'.
# Both '.jpg' (the extension the images are written with in this notebook) and
# '.jpeg' are accepted.
all_imgs = [
    name for name in os.listdir("_tmp_loghi")
    if name.endswith((".jpg", ".jpeg"))
]

# Base names (file name without extension) of all XML files.
all_xmls = [
    os.path.splitext(name)[0]
    for name in os.listdir("_tmp_loghi/page")
    if name.endswith(".xml")
]

# Compare base names. A bare `assert [ ... ]` would only test whether the list
# is non-empty, never whether every image actually has its XML file.
known_xmls = set(all_xmls)
missing_xmls = sorted(
    name for name in all_imgs if os.path.splitext(name)[0] not in known_xmls
)

assert all_imgs, "Loghi has not run on all images: no images found in '_tmp_loghi'"
assert not missing_xmls, f"Loghi has not run on all images, missing XML for: {missing_xmls}"

In [None]:
def get_offset(led, line):
    """Return the first component of the vertical line at index ``line``.

    ``led`` must provide ``vert_lines``, an indexable sequence whose entries
    are themselves indexable.
    """
    return led.vert_lines[line][0]

# Drawing colour per bin index, listed in bin order 0..9.
# Bins 3 and 9 both use ORANGE.
color_map = dict(enumerate((
    RED, BLUE, GREEN, ORANGE,
    CYAN, MAGENTA, WHITE, BLACK,
    GREY, ORANGE,
)))

# Collects one Results object per processed ledger.
all_results = []

# Process each ledger: read the XML produced for its image, sort the recognised
# text lines into column bins, assemble table rows and collect them as Results.

# Text lines consisting only of these characters (or of nothing) are ignored.
NOISE_CHARS = frozenset(",!?()[]{}<>|\\/*+=&%$#@^~`\"':; -_.")
# Margin by which the highlight rectangle of a subtotal value is shrunk. Such a
# value always comes from bin 3, which uses the 20 px margin below.
SUBTOTAL_SHRINK_PX = 20
# Two bin-1 lines are merged when the end of one and the start of the next are
# closer than this (and differ by less than 35 px vertically).
DIST_THRESHOLD = 600

# cv2.imwrite does not create missing directories.
os.makedirs("no_bins", exist_ok=True)
os.makedirs("out_img", exist_ok=True)

for ledger in tqdm(ledgers):
    # Load the corresponding image and XML file
    image_path = f"_tmp_loghi/{ledger.id}.jpg"
    xml_path = f"_tmp_loghi/page/{ledger.id}.xml"
    try:
        tree = ET.parse(xml_path)
    except FileNotFoundError:
        print(f"Error: Missing file(s) for ledger ID {ledger.id}")
        continue

    # cv2.imread does not raise for a missing or unreadable file; it returns None.
    cur_image = cv2.imread(image_path)
    if cur_image is None:
        print(f"Error: Missing file(s) for ledger ID {ledger.id}")
        continue

    root = tree.getroot()
    namespace = get_ns(root)

    # Define bin borders based on vertical lines: left edge, every vertical
    # line, right edge.
    bin_borders = [0] + [line.top for line in ledger.vert_lines] + [ledger.cropped_im.shape[1]]
    if not 6 <= len(bin_borders) <= 8:
        print(f"Error: Invalid number of bins ({len(bin_borders)}) for ledger ID {ledger.id}")

        # Save the page with its detected lines for inspection and skip it.
        for t, b in ledger.vert_lines:
            cv2.line(cur_image, (t, 0), (b, cur_image.shape[0]), (0, 0, 255), 5)

        cv2.imwrite(f"no_bins/{ledger.id}.jpeg", cur_image)
        continue

    # Initialize bins for text lines
    bins = [[] for _ in range(10)]

    # Classify text lines into bins
    for text_line_element in root.findall(f".//{namespace}TextLine"):
        text_line = LineOfText(text_line_element, namespace)

        if all(char in NOISE_CHARS for char in text_line.plain_text):
            continue

        # Index of the first border to the right of the line's average x.
        #   None -> the line lies right of the last border
        #   0    -> the line lies left of the first border (negative x)
        # In both cases the line belongs to no bin and is skipped.
        border_index = next(
            (i for i, border in enumerate(bin_borders) if text_line.avg_x < border),
            None,
        )
        if not border_index:
            continue
        bin_index = border_index - 1

        # Skip short text in bin 1
        if bin_index == 1 and len(text_line.plain_text.strip()) < 2:
            continue

        bins[bin_index].append(text_line)

        # Draw the line's region as a rectangle in its bin colour, shrunk on
        # every side (more so for bin 1).
        SHRINK_PX = 40 if bin_index == 1 else 20
        cv2.rectangle(
            cur_image,
            (text_line.reg_min_x + SHRINK_PX, text_line.reg_min_y + SHRINK_PX),
            (text_line.reg_max_x - SHRINK_PX, text_line.reg_max_y - SHRINK_PX),
            color_map[bin_index], 2
        )

    # Process subtotal lines (bin 3 expected to have numbers)
    subtotal_items = []
    ledger.find_subtotal_lines(0.725)
    for left_line, right_line, row1, row2 in ledger.subtotal_lines:
        closest_line = None
        min_distance = float('inf')
        subtotal_y_value = min(row1, row2)
        has_value_above = False

        # Find the closest line below the subtotal line in bin 3. Lines that
        # are not clearly below it (less than 25 px) count as "above".
        for text_line in bins[3]:
            if text_line.avg_y < subtotal_y_value + 25:
                has_value_above = True
                continue

            distance = abs(text_line.avg_y - subtotal_y_value)
            if distance < min_distance:
                min_distance = distance
                closest_line = text_line

        # Draw the subtotal line
        cv2.line(cur_image, (left_line[0], row2), (right_line[0], row1), BLUE, 5)

        # Highlight the value below the subtotal line and take it out of bin 3
        # so that it is not assigned to a regular row later on.
        if closest_line and min_distance < 175 and has_value_above:
            cv2.rectangle(
                cur_image,
                (closest_line.reg_min_x + SUBTOTAL_SHRINK_PX, closest_line.reg_min_y + SUBTOTAL_SHRINK_PX),
                (closest_line.reg_max_x - SUBTOTAL_SHRINK_PX, closest_line.reg_max_y - SUBTOTAL_SHRINK_PX),
                BLUE, 5
            )
            subtotal_items.append(closest_line)
            bins[3].remove(closest_line)

    # Merge close text lines in bin 1
    bins[1].sort(key=lambda x: x.avg_y)
    merged_bin_1 = []

    for text_line in bins[1]:
        if not merged_bin_1:
            merged_bin_1.append(text_line)
            continue

        prev_line = merged_bin_1[-1]
        last_point_prev = prev_line.points[-1]
        first_point = text_line.points[0]

        # Distance between the end of the previous line and the start of this one.
        dist = np.linalg.norm(np.array(last_point_prev) - np.array(first_point))
        delta_y = abs(last_point_prev[1] - first_point[1])

        if dist < DIST_THRESHOLD and delta_y < 35:
            # Only the text is merged; the points of prev_line stay unchanged.
            prev_line.plain_text += " " + text_line.plain_text
            cv2.line(cur_image, last_point_prev, first_point, (0, 155, 0), 2)
        else:
            merged_bin_1.append(text_line)

    bins[1] = merged_bin_1

    # Generate table rows (one per bin-1 line) and map lines from the other
    # bins to the row that is vertically closest.
    table_rows = [["", line, "", "", "", ""] for line in bins[1] if len(line.plain_text) > 1]

    for bin_index in [0, 2, 3, 4]:
        for text_line in bins[bin_index]:
            closest_row = min(
                table_rows, key=lambda row: abs(row[1].avg_y - text_line.avg_y), default=None
            )

            # None only when there are no table rows at all.
            if closest_row is not None:
                closest_row[bin_index] = text_line

    # Add rows that contain just a subtotal, each in front of the first regular
    # row that lies below it (or at the end if there is none).
    for cl in subtotal_items:
        for j, row in enumerate(table_rows):
            # Rows without a bin-1 cell are previously inserted subtotal rows.
            if row[1] and row[1].avg_y > cl.avg_y:
                table_rows.insert(j, ["", "", "", cl, "", ""])
                break
        else:
            table_rows.append(["", "", "", cl, "", ""])

    # Save table_rows as a Results object
    results = []
    for row in table_rows:
        # Skip empty rows
        if all(cell == "" or not cell or cell is None for cell in row):
            print("Skipped an empty row")
            continue

        # Create a copy of the row for modifications
        row_copy = row.copy()
        date_cell = row[0]

        # Check and adjust date placement
        if not date_cell and row[1]:
            # Everything up to and including the last word that contains a digit
            # is treated as the date and moved to the first column.
            # NOTE(review): words.index() returns the FIRST occurrence of that
            # word; if the word is repeated the split lands earlier - confirm.
            words = row[1].plain_text.split()
            numeric_words = [word for word in words if any(char.isnumeric() for char in word)]

            if numeric_words:
                split_index = words.index(numeric_words[-1]) + 1
                possible_date = " ".join(words[:split_index])

                # Update the date in the first column
                row_copy[0] = LineOfText.from_string(possible_date)
                # Keep the remaining words in the second column. Joining the
                # words (instead of cutting by character count) stays correct
                # when the text has leading or repeated whitespace.
                row_copy[1].plain_text = " ".join(words[split_index:])

        # Draw lines between consecutive located cells in the same row
        prev_cell = None
        for cell in row_copy:
            if isinstance(cell, LineOfText) and cell.line is not None:
                if not prev_cell:
                    prev_cell = cell
                    continue

                cv2.line(
                    cur_image,
                    (prev_cell.max_x, prev_cell.avg_y),
                    (cell.min_x, cell.avg_y),
                    (0, 0, 0), 2
                )
                prev_cell = cell

        # Convert the row into a structured format. Coordinates are shifted by
        # the crop offsets of THIS ledger (not of a ledger left over from an
        # earlier cell).
        structured_row = [
            Cell(
                cell.plain_text,
                [(x + ledger.cropped_left, y + ledger.cropped_top) for x, y in cell.region_points]
            ) if cell else Cell(None)
            for cell in row_copy
        ]

        # Add the structured row to the results
        results.append(ResultsRow.from_row(structured_row))

    cv2.imwrite(f"out_img/{ledger.id}.jpeg", cur_image)

    # Append the results for the current ledger to the final collection.
    # NOTE(review): a trailing `break` used to stop the loop after the first
    # ledger that reached this point; it was removed so that all ledgers are
    # processed - confirm it was only there for debugging.
    all_results.append(Results(results, ledger.id))


In [None]:
# Save every result as a CSV file in 'out_csv', named after the result.
os.makedirs("out_csv", exist_ok=True)
for r in all_results:
    a = r.to_csv()
    # `with` closes the file after writing, so the content is flushed to disk
    # and no file handles are left open.
    with open(f"out_csv/{r.name}.csv", "w") as csv_file:
        print(a, file=csv_file)

In [None]:
import pickle
from ledger import Ledger
# Remove previously saved results, then pickle the current ones.
results_path = "all_results.pkl"
if os.path.exists(results_path):
    os.remove(results_path)

with open(results_path, "wb") as pickle_file:
    pickle.dump(all_results, pickle_file)