In [0]:
# This cell is for Colab only
!unzip data.zip

# Now we should have /data under the working directory.

In [0]:
import glob
import os
import itertools
import collections

In [0]:
def clean_word(w):
    out = []
    for c in w:
        c = c.lower()
        # searching set is faster than list: O(1) vs. O(n=26)
        if c in set('abcdefghijklmnopqrstuvwxyz'):
            out.append(c)
    return ''.join(out)
# clean_word('Caat13.#abE')

In [27]:
# create a set of words from ground truth; disregard repeatition and order of words
word_set = set()
# word_set2 = set()
gt_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'ground_truth', '*.txt'))

count = 0
for gt_f in gt_filenames:
    with open(gt_f) as file:
        raw = file.read()
        # Split file content into words (by '\n', '\t', ' ', etc.)
        uncleaned_words = raw.split()
        # Clean up words, leave only all-alpha chars of length > 1
        for w in uncleaned_words:
            cw = clean_word(w)
            if len(cw) > 1 and len(cw) < 21:
                word_set2.add(cw)
        # Clean up words, leave only all-alpha chars of length > 1 (function programming)
        word_set |= set(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
print(len(word_set))
print(word_set)
# print(len(word_set), len(word_set2))
# print(word_set2 - word_set)

15702


In [28]:
def char_to_index(c):
    return ord(c) - ord('a')
# char_to_index('i')

8

In [0]:
group_by_len = collections.defaultdict(list)
for w in word_set:
    group_by_len[len(w)].append(w)
    
digrams_by_len = collections.defaultdict(dict)
for length in group_by_len:
    for i, j in itertools.combinations(range(length), 2):
        key = (i, j)
        matrix = [[0] * 26 for _ in range(26)]
        for w in group_by_len[length]:
            matrix[char_to_index(w[i])][char_to_index(w[j])] = 1
        digrams_by_len[length][key] = matrix
#     print(length, len(digrams_by_len[length].keys()))

In [0]:
def print_digram(matrix):
    alphabet = ' abcdefghijklmnopqrstuvwxyz'
    print(', '.join(alphabet))
    for i in range(26):
        print(chr(ord('a')+i), matrix[i])
# print(digrams_by_len[3][(0, 1)])

In [34]:
# create a list of words from tesseract text; regard repeatition and order of words
tr_word_list = []
tr_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'tesseract', '*.txt'))
for tr_f in tr_filenames:
    with open(tr_f) as file:
        raw = file.read()
        uncleaned_words = raw.split()
        tr_word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
        
print(tr_word_list[:30])

['ha', 'reas', 'able', 'although', 'substantlally', 'ncreased', 'the', 'programs', 'and', 'authorlty', 'over', 'preset', 'followl', 'brlef', 'of', 'key', 'lssues', 'that', 'were', 'consldered', 'and', 'the', 'votes', 'whlch', 'mior', 'and', 'damaglng', 'changes', 'were', 'rejected']


In [36]:
detected_error_words = []
for idx, w in enumerate(tr_word_list):
    error = False
    for i, j in itertools.combinations(range(len(w)), 2):
#         print(i, j, w[i], w[j], len(digrams_by_len[len(w)][(i, j)]), len(digrams_by_len[len(w)][(i, j)][0]))
        if not digrams_by_len[len(w)][(i, j)][char_to_index(w[i])][char_to_index(w[j])]:
            error = True
    if error:
        left = tr_word_list[i-1] if i > 0 else ''
        right = tr_word_list[i+1] if i < len(tr_word_list)-1 else ''
        detected_error_words.append((w, left, right))

print(len(detected_error_words))        
print(detected_error_words[:30])

50864
[('substantlally', 'over', 'followl'), ('ncreased', 'ncreased', 'programs'), ('followl', 'substantlally', 'the'), ('brlef', 'able', 'substantlally'), ('consldered', 'programs', 'authorlty'), ('damaglng', 'ncreased', 'programs'), ('mandato', 'substantlally', 'the'), ('mlght', 'able', 'substantlally'), ('envlronment', 'and', 'over'), ('requlre', 'substantlally', 'the'), ('prlvatejoarty', 'over', 'followl'), ('nced', 'reas', 'although'), ('requlrl', 'substantlally', 'the'), ('entlng', 'although', 'ncreased'), ('flnal', 'able', 'substantlally'), ('llablllty', 'the', 'and'), ('jolnt', 'able', 'substantlally'), ('rlghts', 'although', 'ncreased'), ('cltlzens', 'ncreased', 'programs'), ('brlng', 'able', 'substantlally'), ('dlsposal', 'ncreased', 'programs'), ('mnent', 'able', 'substantlally'), ('substant', 'ncreased', 'programs'), ('endangement', 'and', 'over'), ('nventory', 'ncreased', 'programs'), ('nghtit', 'although', 'ncreased'), ('provlslon', 'the', 'and'), ('transpo', 'substantlal

[1]