In [None]:
# Notebook that builds a SymSpell dictionary from the Gotland name list, uses it to spell-correct the prediction file of the Gotland name model, and then evaluates the spell-corrected predictions.

In [1]:
import pandas as pd
import numpy as np

In [2]:
"""
Metrics for comparing predicted text against ground-truth labels (both given as strings).
 * Computes Optical Character Recognition (OCR) error rates using the editdistance package.
"""

import string
import unicodedata
import editdistance
import numpy as np


def _error_rate(hyp_tokens, ref_tokens):
    """Edit distance between two token sequences, divided by the longer length.

    Two empty sequences are identical, so they score 0.0 rather than
    triggering a division by zero.
    """
    longest = max(len(hyp_tokens), len(ref_tokens))
    if longest == 0:
        return 0.0
    return editdistance.eval(hyp_tokens, ref_tokens) / longest


def ocr_metrics(predicts, ground_truth, norm_accentuation=False, norm_punctuation=False):
    """Calculate Character Error Rate (CER), Word Error Rate (WER) and Sequence Error Rate (SER).

    Args:
        predicts: sequence of predicted strings.
        ground_truth: sequence of reference strings, paired with ``predicts``
            by position (``zip`` stops at the shorter of the two).
        norm_accentuation: if True, strip accents by NFKD-decomposing and
            dropping every non-ASCII character before comparing.
        norm_punctuation: if True, remove ASCII punctuation before comparing.

    Returns:
        ``(1, 1, 1)`` when either input is empty; otherwise a NumPy array
        ``[cer, wer, ser]`` holding the mean of each rate over all pairs.
        Comparison is case-insensitive. A pair that is empty on both sides
        after normalisation counts as a perfect match (0.0) for every rate.
    """
    if len(predicts) == 0 or len(ground_truth) == 0:
        return (1, 1, 1)

    # Built once: translation table that deletes every ASCII punctuation mark.
    punctuation_remover = str.maketrans("", "", string.punctuation)

    cer, wer, ser = [], [], []

    # `pred`/`truth` rather than `pd`/`gt`: `pd` is the pandas alias in this notebook.
    for pred, truth in zip(predicts, ground_truth):
        pred, truth = pred.lower(), truth.lower()

        if norm_accentuation:
            pred = unicodedata.normalize("NFKD", pred).encode("ASCII", "ignore").decode("ASCII")
            truth = unicodedata.normalize("NFKD", truth).encode("ASCII", "ignore").decode("ASCII")

        if norm_punctuation:
            pred = pred.translate(punctuation_remover)
            truth = truth.translate(punctuation_remover)

        # Character level: every character is a token.
        cer.append(_error_rate(list(pred), list(truth)))

        # Word level: whitespace-separated tokens.
        wer.append(_error_rate(pred.split(), truth.split()))

        # Sequence level: the whole string is one token, so the rate is
        # 0 for an exact match and 1 otherwise.
        ser.append(0.0 if pred == truth else 1.0)

    return np.mean([cer, wer, ser], axis=1)


In [3]:
def _from_first_upper(text):
    """Return `text` from its first uppercase character onward.

    If `text` contains no uppercase character it is returned unchanged.
    """
    for position, char in enumerate(text):
        if char.isupper():
            return text[position:]
    return text


# Only the read needs the file handle; everything else works on the list.
with open('/home/erik/Riksarkivet/Projects/handwritten-text-recognition/output/1930_census_name_gotland_original/flor/predict.txt', 'r') as f:
    lines = f.readlines()

# Lines are told apart by their tag: 'TE_P' lines are collected as predictions
# and 'TE_L' lines as ground truth. Only the leading tag is removed.
predicts = [x[len('TE_P'):] for x in lines if x.startswith('TE_P')]
gts = [x[len('TE_L'):] for x in lines if x.startswith('TE_L')]

# Drop whatever precedes the first capital letter, plus surrounding
# whitespace (including the trailing newline left by readlines()).
predicts_clean = [_from_first_upper(pred).strip() for pred in predicts]
gts_clean = [_from_first_upper(gt).strip() for gt in gts]

assert len(gts_clean) == len(predicts_clean), (
    f"{len(gts_clean)} ground-truth lines but {len(predicts_clean)} prediction lines"
)

In [4]:
# Read the Gotland name list; missing cells are normalised to NaN in the
# same expression instead of patching the frame in place afterwards.
df = pd.read_excel(
    '/home/erik/Riksarkivet/Projects/handwritten-text-recognition/data/namelist_gotland/namn_gotland.xlsx'
).fillna(np.nan)

In [5]:
# Columns whose values are added up into one frequency per name.
count_columns = ['lastnamn_tot', 'förnamn_man', 'förnamn_kvinna']

# A missing count contributes 0; each column is truncated to int before
# summing, as int() did in the per-row version.
frequencies = sum(df[column].fillna(0).astype(int) for column in count_columns)

# One "<name>$<frequency>" line per row.
with open('/home/erik/Riksarkivet/Projects/handwritten-text-recognition/data/namelist_gotland/names_corpus.txt', 'w') as f:

    for name, freq in zip(df['name_propercase'], frequencies):

        # str(NaN) is 'nan': without this guard a row with no name would
        # add the bogus term "nan" to the spelling dictionary.
        if pd.isna(name):
            continue

        f.write(str(name) + '$' + str(freq) + '\n')


    

In [6]:
from symspellpy import SymSpell, Verbosity

# The original call passed the undefined name `count_thre`, which raises
# NameError on a fresh kernel. The arguments are spelled out instead; the
# dictionary edit distance must be at least the max_edit_distance used in
# later lookups (2).
# NOTE(review): count_threshold was presumably meant to be set here; the
# library default is used — confirm.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = '/home/erik/Riksarkivet/Projects/handwritten-text-recognition/data/namelist_gotland/names_corpus.txt'

# Each corpus line is "<term>$<count>". load_dictionary returns False
# instead of raising when the file is missing, which would leave an empty
# dictionary and silently disable all corrections.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1, separator="$"):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")



True

In [10]:
# Position of the one sample that is left out of the evaluation.
# NOTE(review): the reason for excluding this sample is not recorded in the
# notebook (presumably an empty or unusable line) — confirm and document.
EXCLUDED_SAMPLE = 1218


def spell_correct(text):
    """Spell-correct every whitespace-separated name in `text`.

    Each name is looked up in `sym_spell` within edit distance 2 and replaced
    by the first suggestion returned; a name with no suggestion is kept as is.
    Returns the corrected names joined by single spaces.
    """
    corrected = []
    for name in text.split():
        suggestions = sym_spell.lookup(name, Verbosity.CLOSEST, max_edit_distance=2)
        corrected.append(suggestions[0].term if suggestions else name)
    return ' '.join(corrected).strip()


# Rebuilt from scratch on every run, so popping from it is repeatable.
sc_predicts = [spell_correct(pred) for pred in predicts_clean]
sc_predicts.pop(EXCLUDED_SAMPLE)

# gts_clean belongs to an earlier cell and survives re-runs of this one.
# Remove the excluded sample only while it is still as long as
# predicts_clean; otherwise a second run would drop another label and shift
# every later prediction/label pair out of alignment.
if len(gts_clean) == len(predicts_clean):
    gts_clean.pop(EXCLUDED_SAMPLE)

assert len(sc_predicts) == len(gts_clean), "predictions and labels are misaligned"

# Spot check on the first sample.
assert sc_predicts[0] == gts_clean[0]

print(sc_predicts[0])
print(gts_clean[0])

Britta Anna Kristina
Britta Anna Kristina


In [8]:
# Score the spell-corrected predictions against the labels; the result
# unpacks as (CER, WER, SER).
evaluate = ocr_metrics(predicts=sc_predicts, ground_truth=gts_clean)
cer_score, wer_score, ser_score = evaluate

# One report line per metric, each printed with eight decimals.
report_lines = [
    f"Character Error Rate: {cer_score:.8f}",
    f"Word Error Rate:      {wer_score:.8f}",
    f"Sequence Error Rate:  {ser_score:.8f}",
]
e_corpus = "\n".join(report_lines)

print(e_corpus)

Character Error Rate: 0.04963380
Word Error Rate:      0.12659321
Sequence Error Rate:  0.27401894
