In [1]:
import json
from pathlib import Path

import pandas as pd


def round_and_percentage(num: float) -> float:
    """Convert a fraction to a percentage rounded to two decimal places.

    Args:
        num: Value to scale, e.g. ``0.25``.

    Returns:
        ``num`` multiplied by 100 and rounded to two decimals, e.g. ``25.0``.
    """
    as_percent = num * 100
    return round(as_percent, 2)


# Directory-name prefix -> column header used in the result tables.
model_prefix_name_map = {
    "transk": "Transkribus",
    "tess": "Tesseract",
    "trocr": "TrOCR",
}

# Directory-name suffix (everything after the first "_") -> row label.
# Note that "smi_synth" and "sb_smi" share the same row label.
dataset_map = {
    "ub_smi": "GT-Sámi (without base)",
    "smi_synth": "GT-Sámi (synth base)",
    "sb_smi": "GT-Sámi (synth base)",
    "smi": "GT-Sámi",
    "smi_nor": "GT-Sámi + GT-Nor",
    "smi_pred": "GT-Sámi + Pred-Sámi",
    "smi_nor_pred": "GT-Sámi + GT-Nor + Pred-Sámi",
    "sb_smi_nor_pred": "GT-Sámi + GT-Nor + Pred-Sámi (synth base)",
}

# Row order of the tables (differs from the insertion order of dataset_map).
ordered_dataset_values = [
    "GT-Sámi (without base)",
    "GT-Sámi",
    "GT-Sámi + GT-Nor",
    "GT-Sámi + Pred-Sámi",
    "GT-Sámi + GT-Nor + Pred-Sámi",
    "GT-Sámi (synth base)",
    "GT-Sámi + GT-Nor + Pred-Sámi (synth base)",
]

# One row per dataset label, one column per model; every cell starts out as an
# empty string so that combinations without a result render as blank cells.
_n_rows = len(ordered_dataset_values)
cer_df = pd.DataFrame(
    {model_label: [""] * _n_rows for model_label in model_prefix_name_map.values()},
    index=pd.Index(ordered_dataset_values, name="dataset"),
)
# The WER table has the same layout; copy so the two frames are filled independently.
wer_df = cer_df.copy()

# Directory with one sub-directory per evaluated model, named "<prefix>_<dataset suffix>".
output_dir = Path("../output/evaluation/line_level")

# Visit the entries in sorted order: several suffixes map to the same row label
# (e.g. "smi_synth" and "sb_smi"), so with the arbitrary order of iterdir() the value
# that ends up in the table could differ between machines. Sorted, the last name wins.
for model_name in sorted(output_dir.iterdir()):
    # Only directories whose name contains "smi" are evaluated; a stray regular file
    # is skipped instead of failing on the path join below.
    if not model_name.is_dir() or "smi" not in model_name.name:
        continue
    model_prefix, _, model_info = model_name.name.partition("_")

    eval_data_file = model_name / "all_rows.json"
    # Decode explicitly as UTF-8 rather than relying on the platform's locale encoding.
    eval_data = json.loads(eval_data_file.read_text(encoding="utf-8"))

    # An unknown prefix or suffix raises KeyError here, which surfaces unmapped models.
    row_label = dataset_map[model_info]
    column_label = model_prefix_name_map[model_prefix]
    cer_df.at[row_label, column_label] = round_and_percentage(eval_data["CER_concat"])
    wer_df.at[row_label, column_label] = round_and_percentage(eval_data["WER_concat"])

In [2]:
# Show the CER table (values are percentages rounded to two decimals).
cer_df

Unnamed: 0_level_0,Transkribus,Tesseract,TrOCR
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GT-Sámi (without base),2.0,7.93,
GT-Sámi,1.47,4.59,1.98
GT-Sámi + GT-Nor,1.45,4.91,1.95
GT-Sámi + Pred-Sámi,1.61,4.42,1.28
GT-Sámi + GT-Nor + Pred-Sámi,1.19,4.4,1.32
GT-Sámi (synth base),,4.33,1.15
GT-Sámi + GT-Nor + Pred-Sámi (synth base),,4.36,


In [3]:
# Show the WER table (values are percentages rounded to two decimals).
wer_df

Unnamed: 0_level_0,Transkribus,Tesseract,TrOCR
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GT-Sámi (without base),8.11,24.7,
GT-Sámi,6.33,9.84,9.29
GT-Sámi + GT-Nor,5.9,11.39,8.88
GT-Sámi + Pred-Sámi,5.03,8.17,5.0
GT-Sámi + GT-Nor + Pred-Sámi,4.47,7.96,5.14
GT-Sámi (synth base),,8.78,5.04
GT-Sámi + GT-Nor + Pred-Sámi (synth base),,7.7,


In [4]:
def multiline_cell(s: str) -> str:
    r"""Wrap ``s`` in a nested single-column LaTeX ``tabular`` environment.

    Inside the nested ``tabular`` a ``\\`` in ``s`` acts as a line break within
    the cell.

    Args:
        s: LaTeX source of the cell content.

    Returns:
        ``s`` enclosed in ``\begin{tabular}[c]{@{}l@{}}`` ... ``\end{tabular}``.
    """
    opening = r"\begin{tabular}[c]{@{}l@{}}"
    closing = r"\end{tabular}"
    return "".join((opening, s, closing))


def new_name(s: str, table_cell: bool = True) -> str:
    r"""Insert LaTeX line breaks (``\\``) into a row label.

    A ``\\`` is placed directly before every ``(``, and every ``" + "`` (with
    its surrounding spaces) is replaced by ``\\+``.

    Args:
        s: Original row label.
        table_cell: If true and the label was changed, wrap the result with
            ``multiline_cell``; otherwise return the bare string.

    Returns:
        The label with line breaks inserted; unchanged labels are returned as is.
    """
    # str.replace leaves the string untouched when the pattern is absent, so no
    # membership checks are needed before replacing.
    with_breaks = s.replace("(", r"\\(").replace(" + ", r"\\+")
    if table_cell and with_breaks != s:
        return multiline_cell(with_breaks)
    return with_breaks


def df_to_latex_df(df: pd.DataFrame, table_cell: bool) -> pd.DataFrame:
    """Return a copy of ``df`` whose row labels are rewritten by ``new_name``.

    Args:
        df: Frame whose index holds the row labels; it is not modified.
        table_cell: Forwarded to ``new_name`` for every label.

    Returns:
        A new frame with the same data and the rewritten labels as a plain,
        unnamed index.
    """
    latex_labels = [new_name(label, table_cell=table_cell) for label in df.index]
    return df.set_axis(latex_labels, axis="index")


def add_hline(latex_code: str) -> str:
    r"""Append ``\hline`` to every body row except the last one.

    The body is the text between the first ``\midrule`` line and the first
    ``\bottomrule`` line of ``latex_code``.

    Args:
        latex_code: LaTeX table source containing ``\midrule`` and ``\bottomrule``.

    Returns:
        The table source with ``\hline`` inserted at each line end inside the body.

    Raises:
        ValueError: If either marker is missing from ``latex_code``.
    """
    mid_marker = "\\midrule\n"
    body_start = latex_code.index(mid_marker) + len(mid_marker)
    body_end = latex_code.index("\n\\bottomrule")

    header = latex_code[:body_start]
    body = latex_code[body_start:body_end]
    footer = latex_code[body_end:]

    # The body excludes the newline before \bottomrule, so the last row gets no \hline.
    return header + body.replace("\n", "\\hline\n") + footer

In [5]:
def print_latex_table(df: pd.DataFrame):
    r"""Print ``df`` as a LaTeX ``tabular`` with ``\hline`` between body rows.

    Row labels are rewritten with ``df_to_latex_df(..., table_cell=False)``, so
    the inserted ``\\`` breaks appear directly in the label column. Floats are
    formatted with two decimals.

    Args:
        df: Result table to render; it is not modified.
    """
    relabelled = df_to_latex_df(df, table_cell=False)
    print(add_hline(latex_code=relabelled.to_latex(float_format="%.2f")))

In [6]:
# Print the LaTeX source of the CER table.
print_latex_table(cer_df)

\begin{tabular}{llll}
\toprule
 & Transkribus & Tesseract & TrOCR \\
\midrule
GT-Sámi \\(without base) & 2.00 & 7.93 &  \\\hline
GT-Sámi & 1.47 & 4.59 & 1.98 \\\hline
GT-Sámi\\+GT-Nor & 1.45 & 4.91 & 1.95 \\\hline
GT-Sámi\\+Pred-Sámi & 1.61 & 4.42 & 1.28 \\\hline
GT-Sámi\\+GT-Nor\\+Pred-Sámi & 1.19 & 4.40 & 1.32 \\\hline
GT-Sámi \\(synth base) &  & 4.33 & 1.15 \\\hline
GT-Sámi\\+GT-Nor\\+Pred-Sámi \\(synth base) &  & 4.36 &  \\
\bottomrule
\end{tabular}



In [7]:
# Print the LaTeX source of the WER table.
print_latex_table(wer_df)

\begin{tabular}{llll}
\toprule
 & Transkribus & Tesseract & TrOCR \\
\midrule
GT-Sámi \\(without base) & 8.11 & 24.70 &  \\\hline
GT-Sámi & 6.33 & 9.84 & 9.29 \\\hline
GT-Sámi\\+GT-Nor & 5.90 & 11.39 & 8.88 \\\hline
GT-Sámi\\+Pred-Sámi & 5.03 & 8.17 & 5.00 \\\hline
GT-Sámi\\+GT-Nor\\+Pred-Sámi & 4.47 & 7.96 & 5.14 \\\hline
GT-Sámi \\(synth base) &  & 8.78 & 5.04 \\\hline
GT-Sámi\\+GT-Nor\\+Pred-Sámi \\(synth base) &  & 7.70 &  \\
\bottomrule
\end{tabular}

