### Purpose of this script

Create frequency matrices of insertions, deletions, and substitutions in hand-aligned OCRed text

## CODE

In [1]:
import numpy as np
import pandas as pd
import Levenshtein as lv
import io, os, re, sys

In [2]:
## import txt data into dictionary
path = "C:/Users/nicol/OneDrive/Documents/Education/Graduate - UCD/2020 Summer/GSR Spanish Corpus/"

ocr_dict = {}
with io.open((path + '/data/ground-truth-sets/levenshtein-training-data-cleaned.txt'), encoding = 'utf-8') as f:
    for line in f:
        (key, val) = line.split()
        ocr_dict[key] = val

## check dict format
print(ocr_dict.items())

dict_items([('y', 'y'), ('anestro', 'nuestro'), ('objeto', 'objeto'), ('al', 'al'), ('redactar', 'redactar'), ('este', 'este'), ('periódico', 'frances'), (',', ','), ('has', 'las'), ('doctrinas', 'doctrinas'), ('subyersivas', 'subversivas'), ('del', 'del'), ('órden', 'órden'), ('Hestea', 'nuestra'), ('garantia', 'garantía'), ('la', 'la'), ('Constitucion', 'Constitucion'), ('irrelijiosas', 'irrelijiosas'), ('son', 'son'), ('el', 'el'), ('primer', 'primer'), ('mal', 'mal'), ('social', 'social'), ('ellas', 'ellas'), ('se', 'se'), ('proclaman', 'proclaman'), ('propagan', 'propagan'), ('por0', 'por'), ('tido', 'partido'), ('que', 'que'), ('desgraciadamente', 'desgraciadamente'), ('para', 'para'), ('nosotros', 'nosotros'), ('tomó', 'tomó'), ('wr', 'una'), ('posicion', 'posicion'), ('ventaja', 'ventajosa'), ('muestra', 'nuestra'), ('sociedad', 'sociedad'), ('desde', 'desde'), ('dias', 'dias'), ('de', 'de'), ('independencia', 'independencia'), ('en', 'en'), ('segunda', 'segunda'), ('épode', 'é

In [4]:
## get number of unique characters
def unique(list1):
    list_set = set(list1) # get unique values from a list
    unique_list = list(list_set) # convert set to list
    
    return unique_list

chars = []
with io.open((path + '/data/ground-truth-sets/levenshtein-training-data-cleaned.txt'), encoding = 'utf-8') as f:
    for line in f:
        pairs = line.split()
        for word in pairs:
            for char in word:
                chars.append(char)

unique_chars = unique(chars)
unique_chars = sorted(unique_chars) # alphabetize char list
print(unique_chars)
num_char = len(unique_chars)
print(num_char)

## build character index
char_index = {char: i for i, char in enumerate(unique_chars)}
print(char_index)

['!', '%', "'", '(', ')', ',', '.', '0', '1', '4', '6', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'V', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '»', '¿', 'Á', 'É', 'Í', 'Ó', 'á', 'é', 'í', 'ñ', 'ó', 'ú', '“', '”', '€']
79
{'!': 0, '%': 1, "'": 2, '(': 3, ')': 4, ',': 5, '.': 6, '0': 7, '1': 8, '4': 9, '6': 10, '8': 11, '9': 12, ':': 13, ';': 14, '=': 15, '?': 16, 'A': 17, 'B': 18, 'C': 19, 'D': 20, 'E': 21, 'F': 22, 'G': 23, 'H': 24, 'I': 25, 'J': 26, 'L': 27, 'M': 28, 'N': 29, 'O': 30, 'P': 31, 'Q': 32, 'R': 33, 'S': 34, 'T': 35, 'V': 36, 'Y': 37, 'a': 38, 'b': 39, 'c': 40, 'd': 41, 'e': 42, 'f': 43, 'g': 44, 'h': 45, 'i': 46, 'j': 47, 'l': 48, 'm': 49, 'n': 50, 'o': 51, 'p': 52, 'q': 53, 'r': 54, 's': 55, 't': 56, 'u': 57, 'v': 58, 'w': 59, 'x': 60, 'y': 61, 'z': 62, '¡': 63, '»': 64, '¿': 65, 'Á': 66, 'É': 67

In [5]:
## initialize array
sub_matrix = np.zeros((num_char, num_char))
insert_matrix = np.zeros((1, num_char))
delete_matrix = np.zeros((1, num_char))

In [6]:
## get Levenshtein edits

subs = 0
inserts = 0
deletes = 0

for line in ocr_dict.items():
    ocr = line[0]
    corr = line[1]
    edit = lv.editops(ocr, corr)
    for e in edit:
        if e[0] == 'replace': # if the edit is a substitution
            ocr_sub = ocr[e[1]] # index char substituted in ocr string
            corr_sub = corr[e[2]] # index char substituted in corr string
            ocr_sub_index = char_index[ocr_sub] # index ocr char in matrix
            corr_sub_index = char_index[corr_sub] # index corr char in matrix
            sub_matrix[ocr_sub_index][corr_sub_index] += 1 # add 1 to freq of substitution
            subs += 1
        elif e[0] == 'insert':
            corr_insert = corr[e[2]] # index char inserted in corr string
            corr_insert_index = char_index[corr_insert] # index char in matrix
            insert_matrix[0][corr_insert_index] += 1
            inserts += 1
        elif e[0] == 'delete':
            corr_delete = ocr[e[2]] # index char originally in ocr string but deleted in corr string
            corr_delete_index = char_index[corr_delete] # index char in matrix
            delete_matrix[0][corr_delete_index] =+ 1
            deletes += 1

In [7]:
## get total number of errors
print(subs + inserts + deletes)

777


In [26]:
## check success of matrices
print(sub_matrix)
print(insert_matrix)
print(delete_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
   0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0. 20.  2.  5.  5. 12.  2.  1.  4.  6.  1.  4.  3. 11.  9.  1.  1.
   6. 12.  6.  5.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.
   0.  0.  0.  1.  0.  0.  0.]]
[[0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0.
  1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0.]]


In [27]:
## calculate relative frequencies
sub_freq = sub_matrix[:]/subs
insert_freq = insert_matrix[:]/inserts
delete_freq = delete_matrix[:]/deletes

In [28]:
## export data
path = "C:/Users/nicol/OneDrive/Documents/Education/Graduate - UCD/2020 Summer/GSR Spanish Corpus/"

sub_df = pd.DataFrame(sub_freq, index=char_index, columns=char_index)
insert_df = pd.DataFrame(insert_freq, columns=char_index)
delete_df = pd.DataFrame(delete_freq, columns=char_index)

## dfs to .xlsx
sub_df.to_excel((path + '/ocr-post-processing/levenshtein-edits/substitution-freq.xlsx'))
insert_df.to_excel((path + '/ocr-post-processing/levenshtein-edits/insertion-freq.xlsx'))
delete_df.to_excel((path + '/ocr-post-processing/levenshtein-edits/deletion-freq.xlsx'))

## arrays and char_index to .npy
np.save((path + '/ocr-post-processing/levenshtein-edits/sub-freq.npy'), sub_freq)
np.save((path + '/ocr-post-processing/levenshtein-edits/insert-freq.npy'), insert_freq)
np.save((path + '/ocr-post-processing/levenshtein-edits/delete-freq.npy'), delete_freq)
np.save((path + '/ocr-post-processing/levenshtein-edits/char-index.npy'), char_index)