# Visualize and print results on test set 

In [10]:
from collections import defaultdict
import pandas as pd 
from pathlib import Path 
from jiwer import wer, cer

pd.options.plotting.backend = "plotly"

def print_results(dfs: list[pd.DataFrame]):
    def to_percent(num):
        return round(num*100, 2)

    for df in dfs:
        df.transcription = df.transcription.apply(lambda x: str(x).strip())
        df["correct_prediction"] = df.transcription == df.ground_truth
        df["wer"] = wer(reference=df.ground_truth.to_list(), hypothesis=df.transcription.to_list())
        df["cer"] = cer(reference=df.ground_truth.to_list(), hypothesis=df.transcription.to_list())

        all_transcriptions = " ".join(df.transcription)
        all_gt = " ".join(df.ground_truth)
        all_wer = wer(hypothesis=all_transcriptions, reference=all_gt)
        all_cer = cer(hypothesis=all_transcriptions, reference=all_gt)
        mean_wer = df.wer.mean()
        mean_cer = df.cer.mean()
        accuracy = len(df[df.correct_prediction])/len(df)
        
        print(f"""{df.model_name[0]}
    Strict accuracy:    {to_percent(accuracy)}%

    Mean WER:           {to_percent(mean_wer)}%
    WER (all concat):   {to_percent(all_wer)}%

    Mean CER:           {to_percent(mean_cer)}%
    CER (all concat):   {to_percent(all_cer)}%\n""")
        

def dfs_to_score_df(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    scores = defaultdict(list)
    for df in dfs:
        df.transcription = df.transcription.apply(lambda x: str(x).strip())
        df["correct_prediction"] = df.transcription == df.ground_truth
        df["wer"] = wer(reference=df.ground_truth.to_list(), hypothesis=df.transcription.to_list())
        df["cer"] = cer(reference=df.ground_truth.to_list(), hypothesis=df.transcription.to_list())

        all_transcriptions = " ".join(df.transcription)
        all_gt = " ".join(df.ground_truth)
        all_wer = wer(hypothesis=all_transcriptions, reference=all_gt)
        all_cer = cer(hypothesis=all_transcriptions, reference=all_gt)
        mean_wer = df.wer.mean()
        mean_cer = df.cer.mean()
        accuracy = len(df[df.correct_prediction])/len(df)

        scores["model"].append(df.model_name[0])
        scores["score"].append("1 - Mean WER")
        scores["value"].append(1-mean_wer)

        scores["model"].append(df.model_name[0])
        scores["score"].append("1 - Mean CER")
        scores["value"].append(1-mean_cer)
        
        scores["model"].append(df.model_name[0])
        scores["score"].append("1 - WER (concat)")
        scores["value"].append(1-all_wer)

        scores["model"].append(df.model_name[0])
        scores["score"].append("1 - CER (concat)")
        scores["value"].append(1-all_cer)

        scores["model"].append(df.model_name[0])
        scores["score"].append("Accuracy")
        scores["value"].append(accuracy)
    return pd.DataFrame(scores)        

In [11]:
### CHANGE THIS ###
results_dirname = "output/"
### ---------- ###

p = Path(results_dirname)
ps = sorted([e for e in p.iterdir() if e.name.startswith("test") and e.name.endswith("gt.csv")])
dfs = [pd.read_csv(e) for e in ps]
print_results(dfs)

est_smx_20000
    Strict accuracy:    44.27%

    Mean WER:           18.1%
    WER (all concat):   18.1%

    Mean CER:           5.47%
    CER (all concat):   5.36%

est
    Strict accuracy:    11.24%

    Mean WER:           47.31%
    WER (all concat):   47.31%

    Mean CER:           13.16%
    CER (all concat):   12.91%

fin_sme
    Strict accuracy:    30.96%

    Mean WER:           29.37%
    WER (all concat):   29.37%

    Mean CER:           12.54%
    CER (all concat):   12.3%

fin
    Strict accuracy:    11.01%

    Mean WER:           49.0%
    WER (all concat):   48.94%

    Mean CER:           13.77%
    CER (all concat):   13.51%

nor_sme
    Strict accuracy:    37.16%

    Mean WER:           28.08%
    WER (all concat):   28.08%

    Mean CER:           10.87%
    CER (all concat):   10.66%

nor_smx2_20000
    Strict accuracy:    59.63%

    Mean WER:           13.24%
    WER (all concat):   13.24%

    Mean CER:           5.17%
    CER (all concat):   5.07%

nor_smx

In [20]:
score_df = dfs_to_score_df(dfs)
score_df = score_df[score_df.score.apply(lambda x: "Mean" not in x)] # uncomment to include mean scores as well
# score_df = score_df[score_df.model.apply(lambda x: "smx" not in x)]  # uncomment to exclude our models
# score_df = score_df[score_df.model.apply(lambda x: len(x.split("_")) >= 3)] # uncomment to exclude base and giellatekno models
# score_df = score_df[score_df.model.apply(lambda x: "sme" not in x)] # uncomment to exclude giellatekno models
# score_df = score_df[score_df.model.apply(lambda x: "fin" not in x)] # uncomment to exclude finnish base model

# score_df = score_df[score_df.model.apply(lambda x: "_" in x or "sme" in x)] # uncomment to exclude base models
# score_df = score_df[score_df.model.apply(lambda x: len(x.split("_")) == 2)] # uncomment to exclude model with bases

# score_df = score_df[score_df.model.apply(lambda x: x.split("_")[-1] in ("25000", "40000") or not x.split("_")[-1].isnumeric())] # uncomment to exclude smaller models

# score_df = score_df[score_df.model.apply(lambda x: "transkribus" not in x)] # uncomment to exclude transkribus models
score_df.sort_values("model").plot.bar(x='score', y='value', color='model', barmode="group")

In [21]:
model_score_sum = {"model": [], "score_sum": []}
for model, df_ in score_df.groupby("model"):
    score_sum = df_.value.sum()
    model_score_sum["model"].append(model)
    model_score_sum["score_sum"].append(score_sum)

pd.DataFrame(model_score_sum).sort_values("score_sum").plot.bar(x="model", y="score_sum", color="model")

# Find test set examples where models perform well and not so well

In [14]:
names = [e.model_name[0] for e in dfs]
dfs_ = [df for name, df in zip(names, dfs) if "sme" not in name]
correct_predictions = pd.Series([0]*len(dfs_[0]))

for df in dfs_:
    correct_predictions += df.correct_prediction

num_dfs = len(dfs_)

all_correct = correct_predictions.apply(lambda x: x == num_dfs)
none_correct = correct_predictions.apply(lambda x: x == 0)

all_correct_df = dfs_[0][all_correct][["ground_truth", "image"]]
none_correct_df = dfs_[0][none_correct][["ground_truth", "image"]]
len(all_correct_df), len(none_correct_df)

(3, 21)

In [15]:
from PIL import Image
 
testdata_path = Path("data/test")

streker = {'-', '–', '—', '_'}  
contains_streker = 0

for e in none_correct_df.itertuples():
    chars = set(e.ground_truth)
    if streker.intersection(chars):
        contains_streker += 1
        # img = Image.open(testdata_path / e.image)
        # print("\n######################")
        # display(img)
        # print(e.image)
        # print(e.ground_truth) 

    # else:
    #     img = Image.open(testdata_path / e.image)
    #     print("\n######################")
    #     display(img)
    #     print(e.image)
    #     print(e.ground_truth) 

len(none_correct_df), contains_streker

(21, 9)

In [16]:
# from PIL import Image

# testdata_path = Path("data/test")

# for e in all_correct_df.itertuples():
#     img = Image.open(testdata_path / e.image)
#     print("\n######################")
#     display(img)
#     print(e.ground_truth)

In [17]:
# from PIL import Image

# testdata_path = Path("data/test")

# for e in none_correct_df.itertuples():
#     img = Image.open(testdata_path / e.image)
#     print("\n######################")
#     display(img)
#     print(e.ground_truth)
