## Imports and setup

In [1]:
import html
import json
from pathlib import Path

import pandas as pd
from pandas.io.formats.style import Styler


def round_and_percentage(num: float) -> float:
    """Scale a fraction to a percentage and round it to two decimal places."""
    percentage = num * 100
    return round(percentage, 2)


# Maps the prefix of a model output directory name (the part before the first
# "_") to the display name used for that model in the tables below.
model_prefix_name_map = {"transk": "Transkribus", "tess": "Tesseract", "trocr": "TrOCR"}

## Performance table

In [2]:
from collections import defaultdict

# Directory holding the line-level evaluation output of every model.
line_level_dir = "../../output/giellatekno_nor_sme_evaluation-fixed/line_level"

# Evaluation output directories of the model selected for each OCR engine.
best_tesseract_model = f"{line_level_dir}/tess_sb_smi_nor_pred"
best_trocr_model = f"{line_level_dir}/trocr_smi_pred_synth"
best_transkribus_model = f"{line_level_dir}/transk_smi_nor_pred"


# Row order of the performance table.
scores_in_order = ["CER", "WER", "F1"]

# Key in all_rows.json that holds the value for each row of the table.
score_keys = {
    "CER": "CER_concat",
    "WER": "WER_concat",
    "F1": "special_char_F1_mean",
}

# Display name of the model -> its scores (as percentages), in `scores_in_order`.
df_data = {}

for model_p in [best_transkribus_model, best_tesseract_model, best_trocr_model]:
    model_p = Path(model_p)
    if not model_p.exists():
        # Report the missing directory and leave this model out of the table.
        print(model_p)
        continue
    model_prefix, _, _ = model_p.name.partition("_")

    scores = json.loads((model_p / "all_rows.json").read_text(encoding="utf-8"))

    df_data[model_prefix_name_map[model_prefix]] = [
        round_and_percentage(scores[score_keys[score]]) for score in scores_in_order
    ]

In [None]:
# One column per model, one row per metric (rows labelled in display order).
df = pd.DataFrame(data=df_data, index=scores_in_order)
df

Unnamed: 0,Transkribus,Tesseract,TrOCR
CER,0.7,0.12,0.43
WER,5.85,1.02,3.31
F1,100.0,100.0,98.33


In [None]:
# Render every value with two decimals; the colour gradients below are layered on top.
styler = Styler(df).format("{:.2f}")

# Colour each metric row on its own scale (row minimum to row maximum).
# CER and WER use the reversed colormap and F1 the regular one.
row_colormaps = {"CER": "Greens_r", "WER": "Greens_r", "F1": "Greens"}
for metric, cmap in row_colormaps.items():
    metric_row = df.loc[metric]
    styler.background_gradient(
        cmap=cmap,
        vmin=metric_row.min(),
        vmax=metric_row.max(),
        subset=pd.IndexSlice[metric, :],
    )

# The table index has a single level (the metric names), so only level 0
# needs display labels.
styler.format_index(
    {
        "CER": r"CER \(\downarrow\) [\(\%\)]",
        "WER": r"WER \(\downarrow\) [\(\%\)]",
        "F1": r"F1 \(\uparrow\) [\(\%\)]",
    }.__getitem__,
    level=0,
)

print(
    styler.to_latex(
        convert_css=True, multirow_align="t", hrules=True, clines="skip-last;data"
    ).replace(
        r"\cline{1-4}", r"\midrule", 2
    )  # Replace cline with midrule since midrule will add some extra whitespace
)

\begin{tabular}{lrrr}
\toprule
 & Transkribus & Tesseract & TrOCR \\
\midrule
CER \(\downarrow\) [\(\%\)] & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 0.70 & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 0.12 & {\cellcolor[HTML]{80CA80}} \color[HTML]{000000} 0.43 \\
WER \(\downarrow\) [\(\%\)] & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 5.85 & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 1.02 & {\cellcolor[HTML]{6ABF71}} \color[HTML]{000000} 3.31 \\
F1 \(\uparrow\) [\(\%\)] & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 100.00 & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 100.00 & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 98.33 \\
\bottomrule
\end{tabular}



## Error table

In [5]:
# Display name of the model -> column data of its error table. The column keys
# are (label, position) tuples; the position keeps the three "Error" columns
# distinct.
df_data = {}

for model_p in [best_transkribus_model, best_tesseract_model, best_trocr_model]:
    model_p = Path(model_p)
    if not model_p.exists():
        # Report the missing directory and leave this model out of the table.
        print(model_p)
        continue
    model_prefix, _, _ = model_p.name.partition("_")

    evaluation = json.loads((model_p / "all_rows.json").read_text(encoding="utf-8"))
    # Every entry unpacks as ((substring, replacement), count).
    mistakes = evaluation["mistakes"]
    true_positives = dict(evaluation["true_positives"])
    false_negatives = dict(evaluation["false_negatives"])

    # The "replacement" side of each edit is shown to the left of the arrow and
    # is the text the true-positive / false-negative counts are looked up for.
    # NOTE(review): presumably this is the ground-truth text -- confirm against
    # the stringalign edit semantics.
    original_letters = [replacement for (_, replacement), _ in mistakes]

    df_data[model_prefix_name_map[model_prefix]] = {
        ("Error", 0): [repr(letter) for letter in original_letters],
        ("Error", 1): ["->" for _ in mistakes],
        ("Error", 2): [repr(substring) for (substring, _), _ in mistakes],
        # Number of times this particular error occurred.
        (r"\(n_e\)", 3): [count for _, count in mistakes],
        # False-negative count of the original text ("--" when it is empty).
        (r"\(n_m\)", 4): [
            false_negatives.get(c, 0) if c else "--" for c in original_letters
        ],
        # True positives plus false negatives of the original text ("--" when it is empty).
        (r"\(n_c\)", 5): [
            (true_positives.get(c, 0) + false_negatives.get(c, 0)) if c else "--"
            for c in original_letters
        ],
    }

In [6]:
# Since the different models have a different number of mistakes, we first create data frames for each model before we concatenate them sideways
dfs = []
for k, data in df_data.items():
    # Set up dataframe, sort by n_e, then n_m, then n_c (all descending) and keep the top 20 rows
    df = pd.DataFrame(data).sort_values([(r'\(n_e\)', 3), (r'\(n_m\)', 4), (r'\(n_c\)', 5)], ascending=False).head(20)

    # Add the model name as an outermost column level and renumber the rows from 0,
    # so that row i of every model holds that model's i-th ranked error.
    # NOTE(review): the transpose round-trip presumably also leaves every column with
    # object dtype, which keeps the counts from being rendered as floats once shorter
    # tables are padded with NaN -- confirm before simplifying this line.
    df = df.T.reset_index().assign(model=k).set_index(["model", "level_0", "level_1"]).T.reset_index(drop=True)
    # Drop the level names introduced by set_index
    df.columns.names = [None, None, None]

    dfs.append(df)

# Place the per-model tables side by side, aligned on the row number
df = pd.concat(dfs, axis=1)

In [7]:
# Show the side-by-side error table (one block of six columns per model).
df

Unnamed: 0_level_0,Transkribus,Transkribus,Transkribus,Transkribus,Transkribus,Transkribus,Tesseract,Tesseract,Tesseract,Tesseract,Tesseract,Tesseract,TrOCR,TrOCR,TrOCR,TrOCR,TrOCR,TrOCR
Unnamed: 0_level_1,Error,Error,Error,\(n_e\),\(n_m\),\(n_c\),Error,Error,Error,\(n_e\),\(n_m\),\(n_c\),Error,Error,Error,\(n_e\),\(n_m\),\(n_c\)
Unnamed: 0_level_2,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5
0,'',->,'.',12.0,--,--,'ü',->,'i',1.0,2.0,2.0,'ü',->,'ï',2,2,2
1,'ø',->,'e',4.0,5,13,'ü',->,'u',1.0,2.0,2.0,'',->,"','",1,--,--
2,'',->,"','",2.0,--,--,'t',->,'f',1.0,1.0,220.0,'t',->,'l',1,2,220
3,'ü',->,'u',2.0,2,2,'n',->,'m',1.0,1.0,164.0,'l',->,'',1,1,169
4,'',->,'k',1.0,--,--,,,,,,,'o',->,'n',1,1,149
5,'ø',->,'o',1.0,5,13,,,,,,,'m',->,'n',1,1,69
6,'c',->,'',1.0,1,23,,,,,,,'c',->,'e',1,1,23
7,,,,,,,,,,,,,'-',->,'–',1,1,18
8,,,,,,,,,,,,,'ŋ',->,'ž',1,1,9
9,,,,,,,,,,,,,'=',->,'2',1,1,4


### Clearing up the true positive and false negative count of `"te"`

The false negative count of `"te"` is set to 0 since Stringalign counts true positives and false negatives for each individual character, so a multi-character string such as `"te"` has no entry of its own. However, we aggregate the edits here since e.g.

`'te' -> 's'`

could be described by either

`'t' -> ''`
`'e' -> 's'`

or

`'t' -> 's'`
`'e' -> ''`

with no way of discerning them. However, we can manually inspect the `"te"` false negative count

In [8]:
# Row-level results of the TrOCR model. The path is derived from
# `best_trocr_model` so that it cannot drift from the model shown in the tables.
line_errors = pd.read_csv(Path(best_trocr_model) / "row_level.csv")

In [9]:
# Number of occurrences of "te" across all ground-truth lines.
te_true_positive = sum(text.count("te") for text in line_errors["ground_truth"])
te_true_positive

28

In [10]:
# Number of occurrences of "te" across all transcribed (predicted) lines.
te_pred_positive = sum(text.count("te") for text in line_errors["transcription"])
te_pred_positive

26

It seems like the number of false negatives for `"te"` is 2 (28 occurrences in the ground truth minus 26 in the transcriptions). However, there may be more false negatives that are masked by "hallucinated" false positive occurrences of `"te"`, which would mess up the count. To double check it, we display all lines where the number of "t", "e" or "te" differs. We also check where the number of `"l"` differs since there is one deletion of an `"l"` that might result in a false positive (this is the only deletion, as shown in the table above).

In [11]:
# Substrings whose number of occurrences is compared between truth and prediction.
substrings_to_compare = ("t", "e", "te", "l")

for line in line_errors.itertuples():
    truth, prediction = line.ground_truth, line.transcription
    counts_match = all(
        truth.count(substring) == prediction.count(substring)
        for substring in substrings_to_compare
    )
    if counts_match:
        continue

    # At least one of the counts differs: show both versions of the line.
    print("true", truth)
    print("pred", prediction)
    print()

true njalbi (njalbbi) væske
pred njalbi (njabbi) væske

true radioaktivitehta radioaktivitet
pred radioaktivitehta radioaktivist

true dahkkon radioaktivitehta kunstig radioaktivitet
pred dahkkon radioaktivitehta kunstig radioaktivilet

true cihca (ciza) lut
pred cihea (ciza) lut



This by itself isn't enough to ensure that there are only two false negatives. However, if we combine it with the error counts, we see that there is only one `"t"`-error and one `"e"`-error, which means that we have checked all places where a "hallucinated" `"te"` could occur. Thus, we know that there are only 2 false negatives.

In [12]:
# Locate the TrOCR row(s) whose original text is "te" by content rather than by a
# hard-coded row label: the row moves when the table is re-sorted below, so a fixed
# label would patch the wrong row if this cell is run a second time.
te_rows = df[("TrOCR", "Error", 0)] == repr("te")
assert te_rows.any(), "expected a TrOCR error row whose original text is 'te'"

# Fill in the manually verified counts, addressing each column by its full
# three-level key.
df.loc[te_rows, ("TrOCR", r"\(n_m\)", 4)] = te_true_positive - te_pred_positive
df.loc[te_rows, ("TrOCR", r"\(n_c\)", 5)] = te_true_positive

# Re-rank the TrOCR rows now that the counts of the "te" row have changed.
df["TrOCR"] = df["TrOCR"].sort_values([(r'\(n_e\)', 3), (r'\(n_m\)', 4), (r'\(n_c\)', 5)], ascending=False).reset_index(drop=True)
df

  df.loc[11, ("TrOCR", r"\(n_m\)")] = te_true_positive - te_pred_positive
  df.loc[11, ("TrOCR", r"\(n_c\)")] = te_true_positive


Unnamed: 0_level_0,Transkribus,Transkribus,Transkribus,Transkribus,Transkribus,Transkribus,Tesseract,Tesseract,Tesseract,Tesseract,Tesseract,Tesseract,TrOCR,TrOCR,TrOCR,TrOCR,TrOCR,TrOCR
Unnamed: 0_level_1,Error,Error,Error,\(n_e\),\(n_m\),\(n_c\),Error,Error,Error,\(n_e\),\(n_m\),\(n_c\),Error,Error,Error,\(n_e\),\(n_m\),\(n_c\)
Unnamed: 0_level_2,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5
0,'',->,'.',12.0,--,--,'ü',->,'i',1.0,2.0,2.0,'ü',->,'ï',2,2,2
1,'ø',->,'e',4.0,5,13,'ü',->,'u',1.0,2.0,2.0,'',->,"','",1,--,--
2,'',->,"','",2.0,--,--,'t',->,'f',1.0,1.0,220.0,'t',->,'l',1,2,220
3,'ü',->,'u',2.0,2,2,'n',->,'m',1.0,1.0,164.0,'te',->,'s',1,2,28
4,'',->,'k',1.0,--,--,,,,,,,'l',->,'',1,1,169
5,'ø',->,'o',1.0,5,13,,,,,,,'o',->,'n',1,1,149
6,'c',->,'',1.0,1,23,,,,,,,'m',->,'n',1,1,69
7,,,,,,,,,,,,,'c',->,'e',1,1,23
8,,,,,,,,,,,,,'-',->,'–',1,1,18
9,,,,,,,,,,,,,'ŋ',->,'ž',1,1,9


In [13]:
# Print the LaTeX for the error table: two length settings followed by the
# tabular produced by the Styler. The raw Styler output is post-processed with a
# chain of plain string replacements; their order matters, since later
# replacements operate on the text produced by earlier ones.
print(r"\setlength{\tabcolsep}{3.7pt}")
print(r"\setlength{\cmidrulekern}{3.7pt}")
print(
    # Missing cells (models with fewer rows) are rendered as empty strings
    Styler(df.fillna(""))
    .hide(axis="index")
    .to_latex(multicol_align="c", hrules=True, column_format="@{}c@{}c@{}crrr|c@{}c@{}crrr|c@{}c@{}crrr@{}")
    # Remove the header row of the innermost column level (the 0-5 positions)
    .replace(r"0 & 1 & 2 & 3 & 4 & 5 & 0 & 1 & 2 & 3 & 4 & 5 & 0 & 1 & 2 & 3 & 4 & 5 \\", "")
    # Collapse the blank line that is presumably left behind by the removed row
    .replace("\n\n", "\n")
    # Convert arrows to LaTeX
    .replace('->', r'\(\shortrightarrow\)')
    # Escape hashes
    .replace("#", r"\#")
    # Correct quotes: a straight quote at the start of a cell becomes a backtick
    .replace("\n'", "\n`")
    .replace("& '", "& `")
    # Setup midrules: one partial rule per column group instead of a full-width rule
    .replace(
        r"\midrule",
        (
            r"\cmidrule(r){1-3}\cmidrule(lr){4-4}\cmidrule(lr){5-5}\cmidrule(lr){6-6}"
            "\n"
            r"\cmidrule(lr){7-9}\cmidrule(lr){10-10}\cmidrule(lr){11-11}\cmidrule(lr){12-12}"
            "\n"
            r"\cmidrule(lr){13-15}\cmidrule(lr){16-16}\cmidrule(lr){17-17}\cmidrule(l){18-18}"
        )
    )
    # Insert one rule per model (six columns each) before a header row that starts with \multicolumn
    .replace(r"\\" + "\n" + r"\multicolumn", r"\\" + "\n" + r"\cmidrule(r){1-6}\cmidrule(lr){7-12}\cmidrule(l){13-18}" + "\n" + r"\multicolumn")    
    # Bold model names
    .replace("Transkribus", r"\textbf{Transkribus}")
    .replace("Tesseract", r"\textbf{Tesseract}")
    .replace("TrOCR", r"\textbf{TrOCR}")
    # NOTE(review): no model named "Baseline" is defined in this notebook section -- confirm whether this is still needed
    .replace("Baseline", r"\textbf{Baseline}")
    # Center n_c
    .replace(r"\(n_c\)", r"\multicolumn{1}{c}{\(n_c\)}")
)

\setlength{\tabcolsep}{3.7pt}
\setlength{\cmidrulekern}{3.7pt}
\begin{tabular}{@{}c@{}c@{}crrr|c@{}c@{}crrr|c@{}c@{}crrr@{}}
\toprule
\multicolumn{6}{c}{\textbf{Transkribus}} & \multicolumn{6}{c}{\textbf{Tesseract}} & \multicolumn{6}{c}{\textbf{TrOCR}} \\
\cmidrule(r){1-6}\cmidrule(lr){7-12}\cmidrule(l){13-18}
\multicolumn{3}{c}{Error} & \(n_e\) & \(n_m\) & \multicolumn{1}{c}{\(n_c\)} & \multicolumn{3}{c}{Error} & \(n_e\) & \(n_m\) & \multicolumn{1}{c}{\(n_c\)} & \multicolumn{3}{c}{Error} & \(n_e\) & \(n_m\) & \multicolumn{1}{c}{\(n_c\)} \\
\cmidrule(r){1-3}\cmidrule(lr){4-4}\cmidrule(lr){5-5}\cmidrule(lr){6-6}
\cmidrule(lr){7-9}\cmidrule(lr){10-10}\cmidrule(lr){11-11}\cmidrule(lr){12-12}
\cmidrule(lr){13-15}\cmidrule(lr){16-16}\cmidrule(lr){17-17}\cmidrule(l){18-18}
`' & \(\shortrightarrow\) & `.' & 12 & -- & -- & `ü' & \(\shortrightarrow\) & `i' & 1 & 2 & 2 & `ü' & \(\shortrightarrow\) & `ï' & 2 & 2 & 2 \\
`ø' & \(\shortrightarrow\) & `e' & 4 & 5 & 13 & `ü' & \(\shortrightarrow\) & `

  Styler(df.fillna(""))


## Diagnostics plot of transcriptions

In [14]:
from IPython.display import display, Markdown, HTML
from stringalign.align import align_strings, aggregate_alignment, Keep

def make_img_tag(name: str) -> str:
    """Build an HTML ``<img>`` tag for the line image with the given file name.

    The image is loaded from the tesstrain ground-truth directory on GitHub, and
    the ground-truth text of the line is used as the tooltip (``title``).

    Args:
        name: Value of the ``file_name`` column identifying the line image.

    Returns:
        The ``<img>`` tag as a string.

    Raises:
        ValueError: If the global ``df`` does not contain exactly one row with
            this ``file_name`` (raised by ``Series.item``).
    """
    img_url = f"https://raw.githubusercontent.com/divvungiellatekno/tesstrain/refs/heads/main/training-data/nor_sme-ground-truth/{name}"
    # The title is looked up in the global `df`, which must hold the rows that are
    # currently being rendered.
    title = df.query("file_name == @name")["ground_truth"].item()
    # The tag is rendered with escape=False, so the text has to be escaped here;
    # otherwise a quote, ampersand or angle bracket in the ground truth would break
    # the title attribute.
    safe_title = html.escape(str(title), quote=True)
    return f'<img src="{img_url}" style="height: 20px;" title="{safe_title}"/>'

def _non_keep_operations(row):
    """Return the aggregated alignment operations of a row that are not ``Keep``."""
    alignment = align_strings(row["ground_truth"], row["transcription"])[0]
    return [op for op in aggregate_alignment(alignment) if not isinstance(op, Keep)]


models_to_inspect = {
    "Trans": best_transkribus_model,
    "Tess": best_tesseract_model,
    "TrOCR": best_trocr_model,
}

for name, model_p in models_to_inspect.items():
    display(Markdown(f"### {name}"))

    # Keep only the lines that were transcribed incorrectly. The frame has to be
    # bound to the global name `df`, because make_img_tag looks the tooltip text
    # up in `df`.
    all_rows = pd.read_csv(Path(model_p) / "row_level.csv")
    df = all_rows.query("ground_truth != transcription")[
        ["ground_truth", "transcription", "file_name"]
    ]
    df = df.assign(
        errors=df.apply(_non_keep_operations, axis=1),
        # The file name is turned into an <img> tag by the formatter below.
        image=df["file_name"],
    )

    table_html = df[["image", "transcription", "errors", "file_name"]].to_html(
        formatters={"image": make_img_tag},
        escape=False,
        index=False,
    )
    display(HTML(table_html))

### Trans

image,transcription,errors,file_name
,ieš spesifikk.,[Delete(substring='.')],fysihkka-ja-kemiijatearpmat-sme-nob-20-26.png
,jargŋi kolbe.,[Delete(substring='.')],fysihkka-ja-kemiijatearpmat-sme-nob-21-30.png
,sentrifugabohcci sentrifugerer,"[Replace(substring='e', replacement='ø')]",fysihkka-ja-kemiijatearpmat-sme-nob-38-03.png
,"lossodatgieddi tyngdefelt,","[Delete(substring=',')]",fysihkka-ja-kemiijatearpmat-sme-nob-26-26.png
,erret isolere.,[Delete(substring='.')],fysihkka-ja-kemiijatearpmat-sme-nob-13-43.png
,"rovvi, rovit ei enkel bru (= šaldi).",[Delete(substring='.')],algosatnegirji-081-21.png
,"futteral, et hylster.",[Delete(substring='.')],algosatnegirji-087-14.png
,"(olmmoš) i godt humer, optimis-","[Replace(substring='e', replacement='ø')]",algosatnegirji-067-03.png
,joule joule.,[Delete(substring='.')],fysihkka-ja-kemiijatearpmat-sme-nob-22-36.png
,(= sovkkástallat).,[Delete(substring='.')],algosatnegirji-083-39.png


### Tess

image,transcription,errors,file_name
,álgoviđaruovdi råjerm,"[Replace(substring='m', replacement='n')]",fysihkka-ja-kemiijatearpmat-sme-nob-03-29.png
,gealddahuhttit (frans.) utlade,"[Replace(substring='f', replacement='t')]",fysihkka-ja-kemiijatearpmat-sme-nob-16-27.png
,bichnerkreakta buchnertrakt,"[Replace(substring='i', replacement='ü'), Replace(substring='u', replacement='ü')]",fysihkka-ja-kemiijatearpmat-sme-nob-07-29.png


### TrOCR

image,transcription,errors,file_name
,iešliekkasnákca spesifikk varnekapasitet,"[Replace(substring='n', replacement='m')]",fysihkka-ja-kemiijatearpmat-sme-nob-20-32.png
,ionenárja inniseringsenergi,"[Replace(substring='n', replacement='o')]",fysihkka-ja-kemiijatearpmat-sme-nob-21-21.png
,njalbi (njabbi) væske,[Insert(substring='l')],fysihkka-ja-kemiijatearpmat-sme-nob-30-39.png
,"futteral, et hylster,","[Delete(substring=',')]",algosatnegirji-087-14.png
,radioaktivitehta radioaktivist,"[Replace(substring='s', replacement='te')]",fysihkka-ja-kemiijatearpmat-sme-nob-35-10.png
,Vigreuskolonna Vigreuxkolonne,"[Replace(substring='s', replacement='x')]",fysihkka-ja-kemiijatearpmat-sme-nob-44-01.png
,dahkkon radioaktivitehta kunstig radioaktivilet,"[Replace(substring='l', replacement='t')]",fysihkka-ja-kemiijatearpmat-sme-nob-09-06.png
,"njažgát, (dat) njaŋgájit å ligge","[Replace(substring='ž', replacement='ŋ')]",algosatnegirji-069-63.png
,doantá pokker (2 neavri),"[Replace(substring='2', replacement='=')]",algosatnegirji-026-54.png
,cihea (ciza) lut,"[Replace(substring='e', replacement='c')]",fysihkka-ja-kemiijatearpmat-sme-nob-07-36.png
