In [1]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd
from pandas.io.formats.style import Styler


def round_and_percentage(num: float) -> float:
    """Express a fraction as a percentage, rounded to two decimal places."""
    as_percentage = num * 100
    return round(as_percentage, 2)


# Display name for each model, keyed by the prefix of its output directory name
# (the part before the first underscore).
model_prefix_name_map = {
    "transk": "Transkribus",
    "tess": "Tesseract",
    "trocr": "TrOCR",
}

## Performance table

In [2]:
# Output directories of the best model per OCR engine; each is expected to contain
# an `all_rows.json` file with the aggregated scores.
best_tesseract_model = "../../output/giellatekno_nor_sme_evaluation-fixed/line_level/tess_sb_smi_nor_pred"
best_trocr_model = "../../output/giellatekno_nor_sme_evaluation-fixed/line_level/trocr_smi_pred_synth"
best_transkribus_model = (
    "../../output/giellatekno_nor_sme_evaluation-fixed/line_level/transk_smi_nor_pred"
)

# Row order of the performance table.
scores_in_order = ["CER", "WER", "Sámi letter F1"]

# Display name of the model -> its scores, listed in `scores_in_order`.
df_data = {}
for model_p in [best_transkribus_model, best_tesseract_model, best_trocr_model]:
    model_p = Path(model_p)
    if not model_p.exists():
        # Best effort: a missing model is left out of the table instead of aborting.
        print(f"Skipping missing model output directory: {model_p}")
        continue

    # The directory name starts with the model prefix, e.g. "tess_sb_smi_nor_pred".
    model_prefix, _, _ = model_p.name.partition("_")

    scores = json.loads((model_p / "all_rows.json").read_text(encoding="utf-8"))
    model_scores = {
        "WER": round_and_percentage(scores["WER_concat"]),
        "CER": round_and_percentage(scores["CER_concat"]),
        "Sámi letter F1": round_and_percentage(scores["special_char_F1_mean"]),
    }

    df_data[model_prefix_name_map[model_prefix]] = [
        model_scores[score] for score in scores_in_order
    ]

In [3]:
# One column per model, one row per score.
df = pd.DataFrame(data=df_data, index=scores_in_order)
df

Unnamed: 0,Transkribus,Tesseract,TrOCR
CER,0.7,0.12,0.43
WER,5.85,1.02,3.31
Sámi letter F1,100.0,100.0,98.33


In [4]:
# Render the performance table as LaTeX with a per-row colour gradient.
styler = Styler(df).format("{:.2f}")

# Colormap per score row. The error rates use the reversed map so that the lowest
# value gets the darkest green; F1 uses the regular map so the highest value does.
row_cmaps = {"CER": "Greens_r", "WER": "Greens_r", "Sámi letter F1": "Greens"}
for score_name, cmap in row_cmaps.items():
    score_row = df.loc[score_name]
    # vmin/vmax are taken from the row itself so each score is coloured on its own scale.
    styler.background_gradient(
        cmap=cmap,
        vmin=score_row.min(),
        vmax=score_row.max(),
        subset=pd.IndexSlice[score_name, :],
    )

# `df` is indexed by the score names only, so level 0 is the only index level to relabel.
styler.format_index(
    {
        "CER": r"CER \(\downarrow\) [\(\%\)]",
        "WER": r"WER \(\downarrow\) [\(\%\)]",
        "Sámi letter F1": r"Sámi letter F1 \(\uparrow\) [\(\%\)]",
    }.__getitem__,
    level=0,
)

print(
    styler.to_latex(
        convert_css=True, multirow_align="t", hrules=True, clines="skip-last;data"
    ).replace(
        r"\cline{1-4}", r"\midrule", 2
    )  # Replace cline with midrule since midrule will add some extra whitespace
)

\begin{tabular}{lrrr}
\toprule
 & Transkribus & Tesseract & TrOCR \\
\midrule
CER \(\downarrow\) [\(\%\)] & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 0.70 & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 0.12 & {\cellcolor[HTML]{80CA80}} \color[HTML]{000000} 0.43 \\
WER \(\downarrow\) [\(\%\)] & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 5.85 & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 1.02 & {\cellcolor[HTML]{6ABF71}} \color[HTML]{000000} 3.31 \\
Sámi letter F1 \(\uparrow\) [\(\%\)] & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 100.00 & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 100.00 & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 98.33 \\
\bottomrule
\end{tabular}



## Error table

In [5]:
# Number of error rows collected per model; shorter lists are padded up to this length
# so that every column of the resulting table has the same number of rows.
n_max = 15

# (model name, column group, position) -> column values of the error table.
df_data = {}

for model_p in [best_transkribus_model, best_tesseract_model, best_trocr_model]:
    model_p = Path(model_p)
    if not model_p.exists():
        # Best effort: a missing model is left out of the table instead of aborting.
        print(f"Skipping missing model output directory: {model_p}")
        continue
    model_prefix, _, _ = model_p.name.partition("_")
    model_name = model_prefix_name_map[model_prefix]

    # Each entry unpacks as ((substring, replacement), count).
    mistakes = json.loads((model_p / "all_rows.json").read_text(encoding="utf-8"))["mistakes"]
    # Most frequent first; ties are broken by replacement, then by substring.
    mistakes = sorted(mistakes, key=lambda x: (-x[1], x[0][1], x[0][0]))

    # Padding rows still get an arrow, and `repr(pd.NA)` puts the text "<NA>" in the
    # error columns; the LaTeX export cell strips both again.
    padding = [((pd.NA, pd.NA), pd.NA)] * n_max
    top_mistakes = (mistakes + padding)[:n_max]

    df_data[model_name, "Error", 0] = [repr(replacement) for (_, replacement), _ in top_mistakes]
    df_data[model_name, "Error", 1] = ["->" for _, _ in top_mistakes]
    df_data[model_name, "Error", 2] = [repr(substring) for (substring, _), _ in top_mistakes]
    df_data[model_name, "#", 3] = [count for _, count in top_mistakes]

In [6]:
# Export the first ten rows of the error table as LaTeX and patch the generated markup
# with plain string replacements. The replacements run in the order written and some
# match text produced by an earlier one (e.g. the NA clean-up matches the LaTeX arrow),
# so the sequence must be kept intact.
print(
    Styler(pd.DataFrame(df_data).head(10))
    .hide(axis="index")
    .to_latex(multicol_align="c", hrules=True, column_format="@{}c@{}c@{}ccc@{}c@{}ccc@{}c@{}cc@{}")
    # Remove the header row of the innermost (integer) column level. The literal below
    # assumes twelve columns, i.e. three models with four columns each.
    .replace(r"0 & 1 & 2 & 3 & 0 & 1 & 2 & 3 & 0 & 1 & 2 & 3 \\", "")
    # Collapse the blank line left behind by the removal above
    .replace("\n\n", "\n")
    # Convert arrows to LaTeX
    .replace('->', r'\(\rightarrow\)')
    # Escape hashes
    .replace("#", r"\#")
    # Correct quotes: turn the opening straight quote of a cell into a backtick
    .replace("\n'", "\n`")
    .replace("& '", "& `")
    # Setup midrules
    .replace(r"\midrule", r"\cmidrule(r){1-3}\cmidrule(lr){4-4}\cmidrule(lr){5-7}\cmidrule(lr){8-8}\cmidrule(lr){9-11}\cmidrule(l){12-12}")
    .replace(r"\\" + "\n" + r"\multicolumn", r"\\" + "\n" + r"\cmidrule(r){1-4}\cmidrule(lr){5-8}\cmidrule(l){9-12}" + "\n" + r"\multicolumn")    
    # Bold model names
    .replace("Transkribus", r"\textbf{Transkribus}")
    .replace("Tesseract", r"\textbf{Tesseract}")
    .replace("TrOCR", r"\textbf{TrOCR}")
    # Remove NA: first blank out whole padded error triples (including their arrow),
    # then any remaining NA markers
    .replace(r"<NA> & \(\rightarrow\) & <NA>", " &  & ")
    .replace("<NA>", "")
)

\begin{tabular}{@{}c@{}c@{}ccc@{}c@{}ccc@{}c@{}cc@{}}
\toprule
\multicolumn{4}{c}{\textbf{Transkribus}} & \multicolumn{4}{c}{\textbf{Tesseract}} & \multicolumn{4}{c}{\textbf{TrOCR}} \\
\cmidrule(r){1-4}\cmidrule(lr){5-8}\cmidrule(l){9-12}
\multicolumn{3}{c}{Error} & \# & \multicolumn{3}{c}{Error} & \# & \multicolumn{3}{c}{Error} & \# \\
\cmidrule(r){1-3}\cmidrule(lr){4-4}\cmidrule(lr){5-7}\cmidrule(lr){8-8}\cmidrule(lr){9-11}\cmidrule(l){12-12}
`' & \(\rightarrow\) & `.' & 12 & `n' & \(\rightarrow\) & `m' & 1 & `ü' & \(\rightarrow\) & `ï' & 2 \\
`ø' & \(\rightarrow\) & `e' & 4 & `t' & \(\rightarrow\) & `f' & 1 & `' & \(\rightarrow\) & `,' & 1 \\
`' & \(\rightarrow\) & `,' & 2 & `ü' & \(\rightarrow\) & `i' & 1 & `-' & \(\rightarrow\) & `–' & 1 \\
`ü' & \(\rightarrow\) & `u' & 2 & `ü' & \(\rightarrow\) & `u' & 1 & `=' & \(\rightarrow\) & `2' & 1 \\
`' & \(\rightarrow\) & `k' & 1 &  &  &  &  & `c' & \(\rightarrow\) & `e' & 1 \\
`c' & \(\rightarrow\) & `' & 1 &  &  &  &  & `l' & \(\rightar

In [7]:
# Unprocessed error table, for comparison with the LaTeX export.
pd.DataFrame(data=df_data)

Unnamed: 0_level_0,Transkribus,Transkribus,Transkribus,Transkribus,Tesseract,Tesseract,Tesseract,Tesseract,TrOCR,TrOCR,TrOCR,TrOCR
Unnamed: 0_level_1,Error,Error,Error,#,Error,Error,Error,#,Error,Error,Error,#
Unnamed: 0_level_2,0,1,2,3,0,1,2,3,0,1,2,3
0,'',->,'.',12.0,'n',->,'m',1.0,'ü',->,'ï',2.0
1,'ø',->,'e',4.0,'t',->,'f',1.0,'',->,"','",1.0
2,'',->,"','",2.0,'ü',->,'i',1.0,'-',->,'–',1.0
3,'ü',->,'u',2.0,'ü',->,'u',1.0,'=',->,'2',1.0
4,'',->,'k',1.0,,->,,,'c',->,'e',1.0
5,'c',->,'',1.0,,->,,,'l',->,'',1.0
6,'ø',->,'o',1.0,,->,,,'m',->,'n',1.0
7,,->,,,,->,,,'o',->,'n',1.0
8,,->,,,,->,,,'t',->,'l',1.0
9,,->,,,,->,,,'te',->,'s',1.0


## Diagnostics plot of transcriptions

In [8]:
from IPython.display import display, Markdown, HTML
from stringalign.align import align_strings, aggregate_alignment, Keep

def make_img_tag(name: str) -> str:
    """Build an HTML ``<img>`` tag for the ground-truth line image ``name``.

    The tooltip (``title``) is the ground-truth text of that line. It is looked up in
    the module-level ``df``, which must have ``file_name`` and ``ground_truth`` columns
    and exactly one row for ``name`` (``Series.item`` raises ``ValueError`` otherwise).

    Args:
        name: File name of the line image, as stored in the ``file_name`` column.

    Returns:
        The ``<img>`` tag as a string, with the tooltip HTML-escaped.
    """
    # Local import so the function does not depend on another cell for `html`.
    from html import escape

    img_url = f"https://raw.githubusercontent.com/divvungiellatekno/tesstrain/refs/heads/main/training-data/nor_sme-ground-truth/{name}"
    title = df.query("file_name == @name")["ground_truth"].item()
    # The table is rendered without escaping, so quotes, `<` or `&` in the ground truth
    # would otherwise terminate the attribute or be parsed as markup.
    return f'<img src="{img_url}" style="height: 20px;" title="{escape(str(title))}"/>'


# For every model, list the lines whose transcription differs from the ground truth,
# together with the line image and the non-matching alignment operations.
for name, model_p in {
    "Trans": best_transkribus_model,
    "Tess": best_tesseract_model,
    "TrOCR": best_trocr_model
}.items():
    display(Markdown(f"### {name}"))
    # NOTE: this frame must be bound to the global name `df`, because `make_img_tag`
    # looks up the tooltip text there. It replaces the performance table previously
    # stored under the same name.
    df = (
        pd.read_csv(Path(model_p) / "row_level.csv")
        .query("ground_truth != transcription")
        [["ground_truth", "transcription", "file_name"]]
    )
    # Built with a comprehension instead of a row-wise `apply`, which does not return
    # a Series when no row is left (a model without any mismatching line).
    errors = [
        [
            op
            for op in aggregate_alignment(align_strings(ground_truth, transcription)[0])
            if not isinstance(op, Keep)
        ]
        for ground_truth, transcription in zip(df["ground_truth"], df["transcription"])
    ]
    df = df.assign(
        errors=pd.Series(errors, index=df.index, dtype=object),
        # Holds the file name; it is turned into an <img> tag by the formatter below.
        image=df["file_name"],
    )

    display(
        HTML(
            df[["image", "transcription", "errors", "file_name"]].to_html(
                formatters={"image": make_img_tag},
                # Required so the generated <img> tags are rendered as HTML.
                escape=False,
                index=False,
            )
        )
    )

### Trans

image,transcription,errors,file_name
,erret isolere.,[Delete(substring='.')],fysihkka-ja-kemiijatearpmat-sme-nob-13-43.png
,"(olmmoš) i godt humer, optimis-","[Replace(substring='e', replacement='ø')]",algosatnegirji-067-03.png
,"mátkki, mátkkit en (ei) reise",[Delete(substring='k')],algosatnegirji-064-64.png
,"sávdnji, sávnnjit en som","[Replace(substring='o', replacement='ø')]",algosatnegirji-084-48.png
,sentrifugabohcci sentrifugerer,"[Replace(substring='e', replacement='ø')]",fysihkka-ja-kemiijatearpmat-sme-nob-38-03.png
,kaliumihca (-ciza) kalilut,[Insert(substring='c')],fysihkka-ja-kemiijatearpmat-sme-nob-23-02.png
,enn far).,[Delete(substring='.')],algosatnegirji-030-12.png
,jargŋi kolbe.,[Delete(substring='.')],fysihkka-ja-kemiijatearpmat-sme-nob-21-30.png
,joule joule.,[Delete(substring='.')],fysihkka-ja-kemiijatearpmat-sme-nob-22-36.png
,tio tio.,[Delete(substring='.')],fysihkka-ja-kemiijatearpmat-sme-nob-42-16.png


### Tess

image,transcription,errors,file_name
,gealddahuhttit (frans.) utlade,"[Replace(substring='f', replacement='t')]",fysihkka-ja-kemiijatearpmat-sme-nob-16-27.png
,álgoviđaruovdi råjerm,"[Replace(substring='m', replacement='n')]",fysihkka-ja-kemiijatearpmat-sme-nob-03-29.png
,bichnerkreakta buchnertrakt,"[Replace(substring='i', replacement='ü'), Replace(substring='u', replacement='ü')]",fysihkka-ja-kemiijatearpmat-sme-nob-07-29.png


### TrOCR

image,transcription,errors,file_name
,radioaktivitehta radioaktivist,"[Replace(substring='s', replacement='te')]",fysihkka-ja-kemiijatearpmat-sme-nob-35-10.png
,ionenárja inniseringsenergi,"[Replace(substring='n', replacement='o')]",fysihkka-ja-kemiijatearpmat-sme-nob-21-21.png
,njalbi (njabbi) væske,[Insert(substring='l')],fysihkka-ja-kemiijatearpmat-sme-nob-30-39.png
,cihea (ciza) lut,"[Replace(substring='e', replacement='c')]",fysihkka-ja-kemiijatearpmat-sme-nob-07-36.png
,Vigreuskolonna Vigreuxkolonne,"[Replace(substring='s', replacement='x')]",fysihkka-ja-kemiijatearpmat-sme-nob-44-01.png
,doantá pokker (2 neavri),"[Replace(substring='2', replacement='=')]",algosatnegirji-026-54.png
,iešliekkasnákca spesifikk varnekapasitet,"[Replace(substring='n', replacement='m')]",fysihkka-ja-kemiijatearpmat-sme-nob-20-32.png
,"futteral, et hylster,","[Delete(substring=',')]",algosatnegirji-087-14.png
,dahkkon radioaktivitehta kunstig radioaktivilet,"[Replace(substring='l', replacement='t')]",fysihkka-ja-kemiijatearpmat-sme-nob-09-06.png
,"njažgát, (dat) njaŋgájit å ligge","[Replace(substring='ž', replacement='ŋ')]",algosatnegirji-069-63.png
