In [1]:
from pathlib import Path

import pandas as pd

from samisk_ocr.transkribus.export_to_prediction_file import get_line_transcriptions
from samisk_ocr.transkribus.map_transkribus_lines_to_gt_lines import (
    find_image_with_biggest_bbox_overlap,
    line_image_dir_to_urn_line_bbox_df,
)
from samisk_ocr.utils import image_stem_to_pageurn_line_bbox

# Accumulators filled by the model loop: one frame of Transkribus-only rows per
# model, plus the ground-truth-only rows (computed for the first model only).
diff_dfs, gt_diff = [], None

# Example bbox suffix; not referenced by the cells visible in this notebook.
example_bbox_suffix = "_0248_0393_1731_0416"

# Validation-set prediction exports (presumably with / without a language model,
# judging by the "w_lm" / "wo_lm" directory names).
p1 = Path("../../data/transkribus_exports/predictions/val_set/our_line_level_layout_w_lm")
p2 = Path("../../data/transkribus_exports/predictions/val_set/our_line_level_layout_wo_lm")

# Ground truth: derive a frame from the line-image directory, then attach it to the
# dataset metadata by matching metadata "file_name" against the derived "image".
gt_image_dir = Path("../../data/samisk_ocr_dataset/val")
gt_df = line_image_dir_to_urn_line_bbox_df(image_dir=gt_image_dir)
gt_df_metadata = pd.read_csv("../../data/samisk_ocr_dataset/val/_metadata.csv")
gt_df = pd.merge(
    gt_df_metadata,
    gt_df,
    left_on=["file_name", "line"],
    right_on=["image", "line"],
)

# The merge must neither drop nor duplicate metadata rows.
assert len(gt_df) == len(gt_df_metadata)

# For every Transkribus model export, find the predicted lines whose
# (page_urn, line, bbox) key has no exact counterpart in the ground truth.
for p in (p1, p2):
    # Path.iterdir() yields entries in arbitrary order. Sort them so that
    # diff_dfs[0] and gt_diff (taken from the first model only) are the same on
    # every run and on every machine.
    for model_dir in sorted(p.iterdir()):
        if model_dir.name == "old":
            continue
        prediction_df = get_line_transcriptions(model_dir, keep_source_imgs=True)

        # Split each predicted line-image stem into its three key parts.
        transkribus_image_stems = prediction_df.image.apply(lambda x: Path(x).stem)
        page_urns, lines, bboxes = zip(
            *transkribus_image_stems.apply(image_stem_to_pageurn_line_bbox)
        )
        prediction_df["page_urn"] = page_urns
        prediction_df["line"] = lines
        prediction_df["bbox"] = bboxes

        # Rows whose key matches exactly. Not consumed by the visible cells; kept
        # so it can still be inspected interactively after the loop.
        df = gt_df.merge(prediction_df, on=["page_urn", "line", "bbox"])

        # Anti-join: keep only the prediction rows that have no ground-truth match.
        df_diff = prediction_df.merge(
            gt_df, on=["page_urn", "line", "bbox"], how="outer", indicator=True
        )
        df_diff = df_diff[df_diff["_merge"] == "left_only"].drop(columns=["_merge"])[
            ["transcription", "page_urn", "line", "bbox", "source_image"]
        ]
        diff_dfs.append(df_diff)

        # The reverse anti-join (ground-truth rows without a matching prediction)
        # is only computed once, for the first model processed.
        if gt_diff is None:
            gt_diff = gt_df.merge(
                prediction_df, on=["page_urn", "line", "bbox"], how="outer", indicator=True
            )
            gt_diff = gt_diff[gt_diff["_merge"] == "left_only"].drop(
                columns=["_merge", "image_y", "transcription", "source_image", "image_x"]
            )

In [None]:
# Every model's set of unmatched Transkribus rows is expected to be the same.
# `all(df_a == df_b)` would iterate the column labels of the boolean frame (always
# truthy), so compare the contents with DataFrame.equals instead. The index is
# reset on copies so that only the row contents, in order, are compared.
reference_diff = diff_dfs[0].reset_index(drop=True)
for model_diff in diff_dfs[1:]:
    assert reference_diff.equals(
        model_diff.reset_index(drop=True)
    ), "Transkribus-only rows differ between models"

# Work on the re-indexed copy; diff_dfs[0] itself is left untouched, so this cell
# can be re-run safely.
diff_df = reference_diff
diff_df.head(5)

In [None]:
# Mirror the ground-truth file name into an "image" column (presumably the column
# the bbox-overlap lookup reads -- TODO confirm) and renumber the rows from zero.
gt_diff["image"] = gt_diff["file_name"]
gt_diff.index = pd.RangeIndex(len(gt_diff))
gt_diff.head(2)

In [None]:
# Do the unmatched Transkribus rows and unmatched ground-truth rows pair up in number?
diff_df.shape[0] == gt_diff.shape[0]

In [5]:
# For each unmatched Transkribus box, record the result of the overlap lookup
# against the unmatched ground-truth rows in a "gt_image" column.
for row_label, transkribus_bbox in diff_df["bbox"].items():
    diff_df.at[row_label, "gt_image"] = find_image_with_biggest_bbox_overlap(
        bbox=transkribus_bbox, other_df=gt_diff
    )

In [6]:
# Pair each Transkribus row with its looked-up ground-truth row. Columns present in
# both frames get the "_transkribus" / "_gt" suffixes.
# Note: this rebinds diff_df, so the earlier cells must be re-run before this one
# is executed again.
diff_df = pd.merge(
    diff_df,
    gt_diff,
    left_on="gt_image",
    right_on="file_name",
    suffixes=("_transkribus", "_gt"),
)

In [None]:
from PIL import Image

# Directory that the `source_image` file names are resolved against.
p = Path("../../data/transkribus_exports/predictions/val_set/our_line_level_layout_w_lm/smi")

# For every paired row, show the Transkribus crop with its transcription, followed
# by the ground-truth crop with its annotation, both cut from the same page image.
for tup in diff_df.itertuples():
    # Open the page image in a context manager so its file handle is released
    # after each row, even if cropping or displaying raises.
    with Image.open(p / tup.source_image) as img:
        img2 = img.crop(tup.bbox_transkribus)
        img3 = img.crop(tup.bbox_gt)

        print(f"Transkribus box {tup.bbox_transkribus}")
        print(f"Transcription: {tup.transcription}")

        display(img2)

        print(f"GT box {tup.bbox_gt}")
        print(f"Annotation: {tup.text}")
        display(img3)