In [2]:
import json
from pathlib import Path

import pandas as pd
from pandas.io.formats.style import Styler


def round_and_percentage(num: float) -> float:
    """Convert a fraction to a percentage rounded to two decimals.

    Args:
        num: Value expressed as a fraction (e.g. ``0.0061``).

    Returns:
        ``num`` multiplied by 100 and rounded to two decimal places.
    """
    as_percent = num * 100
    return round(as_percent, 2)


# Display name for each model, keyed by the prefix of its output directory
# name (the text before the first underscore, e.g. "tess" in "tess_sb_smi_nor_pred").
model_prefix_name_map = {
    "transk": "Transkribus",
    "tess": "Tesseract",
    "trocr": "TrOCR",
    "baseline": "Baseline",
}

In [3]:
# Row index of the results table: every score paired with every language key.
# The aggregate row is keyed "all" (not "overall") so that it agrees with the
# language keys used when the scores are loaded and with the "all" -> "Overall"
# label mapping applied when the table is styled.
index = pd.MultiIndex.from_product(
    [["CER", "WER", "Sámi letter F1"], ["all", "sma", "sme", "smj", "smn"]]
)

In [4]:
# List the model output directories available for the line-level test-set evaluation.
# NOTE(review): this lists "testset_evaluation", whereas the scores loaded further
# down are read from the directory selected by `dataset` — confirm this is intended.
!ls ../../output/testset_evaluation/line_level

baseline	 tess_sb_smi_nor_pred  trocr_smi_pred_synth
sb_smi_nor_pred  transk_smi_nor_pred


In [7]:
from collections import defaultdict

# Evaluation run to tabulate: "baseline_evaluation", or "testset_evaluation" to not use NB-boxes.
dataset = "baseline_evaluation"
best_tesseract_model = f"../../output/{dataset}/line_level/tess_sb_smi_nor_pred"
best_trocr_model = f"../../output/{dataset}/line_level/trocr_smi_pred_synth"
best_transkribus_model = f"../../output/{dataset}/line_level/transk_smi_nor_pred"
baseline = f"../../output/{dataset}/line_level/baseline"

# Row order of the results table (score-major, language-minor).
langs_in_order = ["all", "sma", "sme", "smj", "smn"]
scores_in_order = ["CER", "WER", "Sámi letter F1"]
index = pd.MultiIndex.from_product([scores_in_order, langs_in_order])


def collect_model_scores(model_dir: Path) -> dict:
    """Read every ``*rows.json`` file in ``model_dir`` into ``{score: {lang: percent}}``.

    The language key is the part of the file stem before the first underscore.
    From each file the ``WER_concat``, ``CER_concat`` and ``special_char_F1_mean``
    entries are read and converted with ``round_and_percentage``.

    Raises:
        KeyError: If a file lacks one of the three expected entries.
    """
    collected = defaultdict(dict)
    for lang_file in model_dir.glob("*rows.json"):
        lang = lang_file.stem.split("_")[0]
        scores = json.loads(lang_file.read_text(encoding="utf-8"))
        collected["WER"][lang] = round_and_percentage(scores["WER_concat"])
        collected["CER"][lang] = round_and_percentage(scores["CER_concat"])
        collected["Sámi letter F1"][lang] = round_and_percentage(
            scores["special_char_F1_mean"]
        )
    return collected


def model_scores_as_column(model_scores: dict, model_dir: Path) -> list:
    """Flatten ``{score: {lang: value}}`` into a list ordered like ``index``.

    Raises:
        KeyError: If a (score, language) pair is missing. The message names the
            score, the language and the model directory so the gap can be found.
    """
    column = []
    for score in scores_in_order:
        for lang in langs_in_order:
            try:
                column.append(model_scores[score][lang])
            except KeyError:
                raise KeyError(
                    f"No {score!r} score for language {lang!r} in {model_dir}"
                ) from None
    return column


# One table column per model; models whose output directory is absent are skipped.
df_data = {}

for model_dir in map(
    Path, [best_transkribus_model, best_tesseract_model, best_trocr_model, baseline]
):
    if not model_dir.exists():
        print(f"Skipping missing model directory: {model_dir}")
        continue
    # "tess_sb_smi_nor_pred" -> "tess"; a name without "_" (e.g. "baseline") is kept whole.
    model_prefix, _, _ = model_dir.name.partition("_")

    df_data[model_prefix_name_map[model_prefix]] = model_scores_as_column(
        collect_model_scores(model_dir), model_dir
    )

In [8]:
# Assemble the results table: rows follow `index` (score, language) and each
# entry of `df_data` becomes one model column.
df = pd.DataFrame(data=df_data, index=index)
df

Unnamed: 0,Unnamed: 1,Transkribus,Tesseract,TrOCR,Baseline
CER,all,0.61,0.89,0.74,3.38
CER,sma,0.33,1.09,0.33,2.05
CER,sme,0.53,0.73,1.2,3.99
CER,smj,0.34,0.26,0.66,2.46
CER,smn,1.22,1.43,0.43,4.36
WER,all,3.19,4.65,2.96,18.71
WER,sma,2.42,7.45,2.33,15.98
WER,sme,1.66,2.9,3.41,20.08
WER,smj,3.27,1.84,3.47,13.27
WER,smn,6.18,7.13,2.4,22.62


In [9]:
# Display the results table again (the same `df` shown by the previous cell).
df

Unnamed: 0,Unnamed: 1,Transkribus,Tesseract,TrOCR,Baseline
CER,all,0.61,0.89,0.74,3.38
CER,sma,0.33,1.09,0.33,2.05
CER,sme,0.53,0.73,1.2,3.99
CER,smj,0.34,0.26,0.66,2.46
CER,smn,1.22,1.43,0.43,4.36
WER,all,3.19,4.65,2.96,18.71
WER,sma,2.42,7.45,2.33,15.98
WER,sme,1.66,2.9,3.41,20.08
WER,smj,3.27,1.84,3.47,13.27
WER,smn,6.18,7.13,2.4,22.62


In [12]:
# Columns that take part in the colour scale; "Baseline" is left unshaded.
model_columns = ["Transkribus", "Tesseract", "TrOCR"]
# Colormap per score: reversed for the error rates (lower is better) and
# regular for F1 (higher is better), so the best values get the darkest green.
score_cmaps = {"CER": "Greens_r", "WER": "Greens_r", "Sámi letter F1": "Greens"}

styler = Styler(df).format("{:.2f}")

for score, cmap in score_cmaps.items():
    # The colour range is shared by all languages and models within one score.
    score_block = df.loc[score, model_columns]
    styler.background_gradient(
        cmap=cmap,
        vmin=score_block.min(axis=None),
        vmax=score_block.max(axis=None),
        subset=pd.IndexSlice[(score, slice(None)), model_columns],
    )

# Human-readable row labels; the lookups raise KeyError on an unexpected key.
styler.format_index(
    {
        "CER": r"CER \(\downarrow\) [\(\%\)]",
        "WER": r"WER \(\downarrow\) [\(\%\)]",
        "Sámi letter F1": r"Sámi letter F1 \(\uparrow\) [\(\%\)]",
    }.__getitem__,
    level=0,
)
styler.format_index(
    {
        "all": "Overall",
        "sma": "South",
        "sme": "North",
        "smj": "Lule",
        "smn": "Inari",
    }.__getitem__,
    level=1,
)

# The cline emitted between score groups spans every table column (index levels
# plus data columns); derive the width from the frame instead of hard-coding 6.
n_table_columns = df.index.nlevels + len(df.columns)
latex_table = styler.to_latex(
    convert_css=True, multirow_align="t", hrules=True, clines="skip-last;data"
)

# Replace cline with midrule since midrule will add some extra whitespace
print(latex_table.replace(rf"\cline{{1-{n_table_columns}}}", r"\midrule", 2))

\begin{tabular}{llrrrr}
\toprule
 &  & Transkribus & Tesseract & TrOCR & Baseline \\
\midrule
\multirow[t]{5}{*}{CER \(\downarrow\) [\(\%\)]} & Overall & {\cellcolor[HTML]{2F974E}} \color[HTML]{F1F1F1} 0.61 & {\cellcolor[HTML]{81CA81}} \color[HTML]{000000} 0.89 & {\cellcolor[HTML]{50B264}} \color[HTML]{F1F1F1} 0.74 & 3.38 \\
 & South & {\cellcolor[HTML]{005723}} \color[HTML]{F1F1F1} 0.33 & {\cellcolor[HTML]{BBE4B4}} \color[HTML]{000000} 1.09 & {\cellcolor[HTML]{005723}} \color[HTML]{F1F1F1} 0.33 & 2.05 \\
 & North & {\cellcolor[HTML]{1E8741}} \color[HTML]{F1F1F1} 0.53 & {\cellcolor[HTML]{4BB062}} \color[HTML]{F1F1F1} 0.73 & {\cellcolor[HTML]{D4EECE}} \color[HTML]{000000} 1.20 & 3.99 \\
 & Lule & {\cellcolor[HTML]{005A24}} \color[HTML]{F1F1F1} 0.34 & {\cellcolor[HTML]{00441B}} \color[HTML]{F1F1F1} 0.26 & {\cellcolor[HTML]{39A257}} \color[HTML]{F1F1F1} 0.66 & 2.46 \\
 & Inari & {\cellcolor[HTML]{D9F0D3}} \color[HTML]{000000} 1.22 & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 1.43 & {