In [0]:
# For Colab only
!unzip data.zip
# import sys
# print(sys.version)

In [0]:
import glob
import os
import itertools
import collections

In [0]:
def clean_word(w):
    out = []
    for c in w:
        c = c.lower()
        # searching set is faster than list: O(1) vs. O(n=26)
        if c in set('abcdefghijklmnopqrstuvwxyz'):
            out.append(c)
    return ''.join(out)
# clean_word('Caat13.#abE')

In [20]:
# create a set of words from ground truth; disregard repeatition and order of words
word_set = set()
# word_set2 = set()
gt_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'ground_truth', '*.txt'))
for gt_f in gt_filenames:
    with open(gt_f) as file:
        raw = file.read()
        # Split file content into words (by '\n', '\t', ' ', etc.)
        uncleaned_words = raw.split()
        print(uncleaned_words)
        # Clean up words, leave only all-alpha chars of length > 1
#         for w in uncleaned_words:
#             cw = clean_word(w)
#             if len(cw) > 1:
#                 word_set2.add(cw)
        # Clean up words, leave only all-alpha chars of length > 1 (function programming)
        word_set |= set(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
print(len(word_set), word_set)

0 set()


In [0]:
def char_to_index(c):
    return ord(c) - ord('a')
char_to_index('')

-47

In [0]:
group_by_len = collections.defaultdict(list)
for w in word_set:
    group_by_len[len(w)].append(w)
    
digrams_by_len = collections.defaultdict(dict)
for length in group_by_len:
    for i, j in itertools.combinations(range(length), 2):
        key = (i, j)
        matrix = [[0] * 26 for _ in range(26)]
        for w in group_by_len[length]:
            matrix[char_to_index(w[i])][char_to_index(w[j])] = 1
        digrams_by_len[length][key] = matrix
#     print(length, len(digrams_by_len[length].keys()))

In [0]:
def print_digram(matrix):
    alphabet = ' abcdefghijklmnopqrstuvwxyz'
    print(', '.join(alphabet))
    for i in range(26):
        print(chr(ord('a')+i), matrix[i])

In [0]:
# create a list of words from tesseract text; regard repeatition and order of words
tr_word_list = []
tr_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'tesseract', '*.txt'))
for tr_f in tr_filenames:
    with open(tr_f) as file:
        raw = file.read()
        uncleaned_words = raw.split()
        tr_word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
        
print(tr_word_list[:-30:-1])

['actlvltles', 'packaglng', 'and', 'dlstrlbutlon', 'transportatlon', 'nvolvement', 'and', 'commltment', 'mca', 'for', 'need', 'contlnued', 'the', 'lllustrate', 'helps', 'belleve', 'we', 'efforts', 'companys', 'one', 'only', 'phaslzes', 'though', 'program', 'thls', 'summarlze', 'to', 'developed', 'was']


In [0]:
detected_error_words = []
for idx, w in enumerate(tr_word_list):
    error = False
    for i, j in itertools.combinations(range(len(w)), 2):
#         print(i, j, w[i], w[j], len(digrams_by_len[len(w)][(i, j)]), len(digrams_by_len[len(w)][(i, j)][0]))
        if not digrams_by_len[len(w)][(i, j)][char_to_index(w[i])][char_to_index(w[j])]:
            error = True
    if error:
        left = tr_word_list[i-1] if i > 0 else ''
        right = tr_word_list[i+1] if i < len(tr_word_list)-1 else ''
        detected_error_words.append((w, left, right))

print(detected_error_words[:30])

[('organlzatlonal', 'carollna', 'kentucky'), ('restructurlng', 'th', 'and'), ('carollna', 'or', 'meetlngs'), ('asslstd', 'organlzatlonal', 'restructurlng'), ('membershlp', 'meetlngs', 'so'), ('dhlo', 'worked', 'slte'), ('lettr', 'on', 'organlzatlonal'), ('urglng', 'slte', 'or'), ('partlclpatlon', 'th', 'and'), ('geraldlne', 'restructurlng', 'callfornla'), ('representatlves', 'and', 'am'), ('valously', 'or', 'meetlngs'), ('malntalned', 'meetlngs', 'so'), ('leadrshlp', 'restructurlng', 'callfornla'), ('dlstrlbuted', 'callfornla', 'th'), ('metlng', 'slte', 'or'), ('summarlzes', 'meetlngs', 'so'), ('meetlng', 'organlzatlonal', 'restructurlng'), ('drganzatlons', 'so', 'carollna'), ('analzes', 'organlzatlonal', 'restructurlng'), ('crlthues', 'or', 'meetlngs'), ('meetlng', 'organlzatlonal', 'restructurlng'), ('provlds', 'organlzatlonal', 'restructurlng'), ('gb', '', 'worked'), ('nput', 'worked', 'slte'), ('fertlllzatlon', 'th', 'and'), ('aklng', 'on', 'organlzatlonal'), ('clrculatlon', 'callf

In [16]:
# print(char_to_index('ﬁ'))
def feature_7(w):
    return 1 if len(w) > 20 else 0

[1]