In [1]:
from pathlib import Path

from samisk_ocr.transkribus.export_to_prediction_file import get_line_transcriptions

# Ground truth: the Transkribus export the test dataset was built from
annotation_export = Path("../../data/transkribus_exports/check/Testsett_Samisk_OCR_nytest")
# Transkribus export holding the Transkribus model predictions on the same test data
prediction_export = Path("../../data/transkribus_exports/check/Testsett_Samisk_OCR_nytest_preds")

# One row per text line in each export
annotation_df = get_line_transcriptions(annotation_export, keep_source_imgs=True)
prediction_df = get_line_transcriptions(prediction_export, keep_source_imgs=True)

# Both exports must contain the same number of lines, otherwise they cannot be compared
assert len(prediction_df) == len(annotation_df)

In [2]:
# Last part of image filenames contain line polygon bbox (extracted from alto-xml files),
# encoded as four underscore-separated integers, for example:
example_bbox_suffix = "0248_0393_1731_0416"


def get_bbox(file_stem: str) -> tuple[int, int, int, int]:
    """Parse the line bbox encoded in the last four ``_``-separated fields of a file stem.

    The fields are split off from the right instead of slicing a fixed number of
    characters, so the coordinates do not have to be zero-padded to exactly four
    digits and the rest of the stem may have any length.

    Args:
        file_stem: File name without extension, ending in four underscore-separated
            integers (e.g. ``"..._0381_0398_2130_0486"``).

    Returns:
        The four integers in the order they appear in the stem
        (presumably left, top, right, bottom -- TODO confirm against the alto-xml export).

    Raises:
        ValueError: If the stem does not end in four underscore-separated integers.
    """
    # rsplit with maxsplit=4 leaves everything before the bbox in the first element
    fields = file_stem.rsplit("_", 4)[-4:]
    if len(fields) != 4:
        raise ValueError(
            f"Expected four '_'-separated bbox fields at the end of {file_stem!r}"
        )
    # Unpacking into four names keeps the declared 4-tuple return type honest
    first, second, third, fourth = (int(field) for field in fields)
    return first, second, third, fourth


# Sanity check: show one line image name from the prediction export and the bbox parsed from it
example = prediction_df["image"][0]
print(example)
example_stem = Path(example).stem
get_bbox(example_stem)

URN_NBN_no-nb_pliktmonografi_000007243_0098_000_0381_0398_2130_0486.jpg


(381, 398, 2130, 486)

In [3]:
# Lines whose image name (which ends in the bbox) is identical in both exports
same_bboxes = annotation_df.merge(prediction_df, how="inner", on="image")

print(
    f"Of {len(annotation_df)} lines, {len(same_bboxes)} have the same bbox in the prediction export as the annotation export"
)

Of 791 lines, 700 have the same bbox in the prediction export as the annotation export


In [4]:
from samisk_ocr.transkribus.map_transkribus_lines_to_gt_lines import (
    find_image_with_biggest_bbox_overlap,
)

# Outer-join the two exports on line image name; the merge indicator then tells us
# which line images exist in only one export, i.e. where the bboxes differ
diff = prediction_df.merge(
    annotation_df,
    how="outer",
    on=["image", "source_image"],
    indicator=True,
)


def _lines_only_in(merged, side: str, transcription_column: str):
    """Return the rows of ``merged`` that occur in one export only, with parsed bboxes.

    Args:
        merged: Outer merge of the two exports, created with ``indicator=True``.
        side: Merge-indicator value to keep (``"left_only"`` or ``"right_only"``).
        transcription_column: Suffixed transcription column that is renamed to
            ``"transcription"``.

    Returns:
        A new frame with a fresh 0..n-1 index and an added ``"bbox"`` column.
    """
    only_here = merged.loc[merged["_merge"] == side].reset_index(drop=True)
    only_here["bbox"] = only_here["image"].apply(lambda name: get_bbox(Path(name).stem))
    return only_here.rename(columns={transcription_column: "transcription"})


# Left side of the merge is the prediction export, right side the annotation export
prediction_diff = _lines_only_in(diff, "left_only", "transcription_x")
annotation_diff = _lines_only_in(diff, "right_only", "transcription_y")


# Match up lines with different bboxes using maximum area overlap
prediction_diff["annotation_image"] = [""] * len(prediction_diff)

for page, page_predictions in prediction_diff.groupby("source_image"):
    # Only annotation lines from the same source image are candidates
    page_annotations = annotation_diff.loc[
        annotation_diff["source_image"] == page
    ].reset_index(drop=True)
    best_matches = [
        find_image_with_biggest_bbox_overlap(bbox=bbox, other_df=page_annotations)
        for bbox in page_predictions["bbox"]
    ]
    prediction_diff.loc[page_predictions.index, "annotation_image"] = best_matches


# Put each prediction line next to the annotation line it was matched with
aligned_df = prediction_diff.merge(
    annotation_diff,
    suffixes=("_pred", "_gt"),
    left_on=["annotation_image", "source_image"],
    right_on=["image", "source_image"],
)

# Every differing line should have been paired up
assert len(prediction_diff) == len(annotation_diff) == len(aligned_df)

In [5]:
# Save the mismatching bbox pairs together with their source image
bbox_columns = ["bbox_pred", "bbox_gt", "source_image"]
aligned_df.loc[:, bbox_columns].to_csv("../../data/different_bbox_text.csv", index=False)

In [6]:
from PIL import Image

# Show bbox differences: for every aligned pair, print both transcriptions and both bboxes
for row in aligned_df.itertuples():
    annotation, model_prediction = row.transcription_gt, row.transcription_pred
    annotation_bbox, prediction_bbox = row.bbox_gt, row.bbox_pred

    # Visual comparison of the two crops is currently switched off:
    # img = Image.open(annotation_export / str(row.source_image))
    # img_gt = img.crop(annotation_bbox)
    # img_pred = img.crop(prediction_bbox)

    # display(img_gt)
    # display(img_pred)

    # A single print with a newline separator emits the same text as three separate prints
    print(
        f"Annotation: {annotation}\nModel prediction: {model_prediction}",
        f"\nAnnotation bbox:      {annotation_bbox}\nPrediction file bbox: {prediction_bbox}",
        "-----\n",
        sep="\n",
    )

Annotation: ulmuin koijâdiđ, kost taid finnee.
Model prediction: ulmuin koijâdiđ, kost taid finnee.

Annotation bbox:      (636, 1550, 1793, 1653)
Prediction file bbox: (636, 1550, 1793, 1643)
-----

Annotation: lam pääihist. Nube peln luovtâ, maađijpiällást, uáinojii
Model prediction: lam pääihist. Nube peln luovtâ, maađijpiällást, uáinojii

Annotation bbox:      (629, 2179, 2515, 2298)
Prediction file bbox: (629, 2181, 2515, 2280)
-----

Annotation: kuobžâenni njurgádij. Lasâlijne oinui liihâstmin já forgâ
Model prediction: kuobžâenni njurgádij. Lasâlijne oinui liihâstmin já forgâ

Annotation bbox:      (629, 2712, 2546, 2820)
Prediction file bbox: (629, 2717, 2546, 2820)
-----

Annotation: iiđij Sammlii Issá porthái oolâ.
Model prediction: iiđij Sammlii Issá porthái oolâ.

Annotation bbox:      (621, 2824, 1676, 2919)
Prediction file bbox: (621, 2834, 1676, 2919)
-----

Annotation: finniiččim ruuđâ, pesâččim mäksiđ taam elleituáhtár
Model prediction: finniiččim ruuđâ, pesâččim mäksi