In [8]:
import json
from pathlib import Path

import pandas as pd


def round_and_percentage(num: float) -> float:
    """Express a fraction as a percentage rounded to two decimal places.

    Args:
        num: The value to convert, e.g. ``0.0159``.

    Returns:
        ``num`` multiplied by 100 and rounded to 2 decimals, e.g. ``1.59``.
    """
    as_percent = num * 100
    return round(as_percent, 2)


# Table column headers (Norwegian): "Transkribus with language model" and
# "Transkribus without language model", in that order.
columns = [
    "Transkribus med språkmodell",
    "Transkribus uten språkmodell",
]

# Dataset key (used to index this mapping when results are loaded) -> row label.
# Both "ub_smi" and "smi_ub" resolve to the same "without base" row.
dataset_map = {
    **dict.fromkeys(("ub_smi", "smi_ub"), "GT-Sámi (without base)"),
    "smi": "GT-Sámi",
    "smi_nor": "GT-Sámi + GT-Nor",
    "smi_pred": "GT-Sámi + Pred-Sámi",
    "smi_nor_pred": "GT-Sámi + GT-Nor + Pred-Sámi",
}

# Row order of the result tables: the distinct labels of ``dataset_map`` in the
# order in which they first appear there.
ordered_dataset_values = list(dict.fromkeys(dataset_map.values()))

# Empty result tables: one row per dataset label (index named "dataset"), one
# column per entry of ``columns``. Every cell starts as the placeholder "" and
# is later overwritten with a float percentage.
#
# The dtype is pinned to ``object`` on purpose: the cells hold a mix of ""
# placeholders and floats. Without it, the all-string columns can be inferred
# as a dedicated string dtype (the default in pandas 3), which does not accept
# float assignments.
cer_df = pd.DataFrame(
    {e: [""] * len(ordered_dataset_values) for e in columns},
    index=pd.Index(ordered_dataset_values, name="dataset"),
    dtype=object,
)

# The WER table has the same layout; copy so the two frames are filled independently.
wer_df = cer_df.copy()

# Root folder that is searched for per-model evaluation results.
output_dir = Path("../../output/evaluation/line_level")

# Fill both tables from every "transk*" result directory. Iterating in sorted
# order keeps the outcome reproducible if two directories map to the same cell
# (e.g. the "ub_smi" and "smi_ub" spellings), because the last write wins.
for model_dir in sorted(output_dir.glob("transk*")):
    # Everything after the first "_" in the directory name identifies the run.
    _, _, model_info = model_dir.name.partition("_")

    if "lm" in model_info:
        # Language-model run: goes in the first column. The last "_"-separated
        # token is dropped so that the remainder is a key of ``dataset_map``.
        col = columns[0]
        model_info, _, _ = model_info.rpartition("_")
    else:
        col = columns[1]

    # Decode explicitly as UTF-8 instead of relying on the locale default.
    eval_data_file = model_dir / "all_rows.json"
    eval_data = json.loads(eval_data_file.read_text(encoding="utf-8"))

    # Raises KeyError if the directory name does not resolve to a known dataset.
    dataset_label = dataset_map[model_info]
    cer_df.at[dataset_label, col] = round_and_percentage(eval_data["CER_concat"])
    wer_df.at[dataset_label, col] = round_and_percentage(eval_data["WER_concat"])

In [2]:
# Show the table filled from each run's "CER_concat" value (as percentages).
cer_df

Unnamed: 0_level_0,Transkribus med språkmodell,Transkribus uten språkmodell
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
GT-Sámi (without base),1.59,2.0
GT-Sámi,1.28,1.47
GT-Sámi + GT-Nor,1.31,1.45
GT-Sámi + Pred-Sámi,1.48,1.61
GT-Sámi + GT-Nor + Pred-Sámi,1.07,1.19


In [3]:
# Show the table filled from each run's "WER_concat" value (as percentages).
wer_df

Unnamed: 0_level_0,Transkribus med språkmodell,Transkribus uten språkmodell
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
GT-Sámi (without base),5.67,8.11
GT-Sámi,4.34,6.33
GT-Sámi + GT-Nor,4.35,5.9
GT-Sámi + Pred-Sámi,4.02,5.03
GT-Sámi + GT-Nor + Pred-Sámi,3.58,4.47
