#Error Detection

In [0]:
# Run this cell only for Colab users
!unzip data.zip
# Otherwise, put the /data folder (containing two subfolders of 100 .txt files)
# under the working directory.
# Now we should have /data under the working directory.

In [0]:
import string
import glob
import os
import itertools
import collections

In [0]:
def clean_word(w):
    out = []
    for c in w:
        c = c.lower()
        # Searching set is faster than list: O(1) vs. O(n=26)
        if c in set(string.ascii_lowercase):
            out.append(c)
    return ''.join(out)
# clean_word('Caat13.#abE')

In [0]:
def char_to_index(c):
    return ord(c) - ord('a')
# char_to_index('i')

In [0]:
def print_digram(matrix):
    alphabet = ' abcdefghijklmnopqrstuvwxyz'
    print(', '.join(alphabet))
    for i in range(26):
        print(chr(ord('a')+i), matrix[i])
# print(digrams_by_len[3][(0, 1)])

In [27]:
# create a set of words from ground truth; disregard repeatition and order of words
word_set = set()
# word_set2 = set()
gt_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'ground_truth', '*.txt'))

count = 0
for gt_f in gt_filenames:
    with open(gt_f) as file:
        raw = file.read()
        # Split file content into words (by '\n', '\t', ' ', etc.)
        uncleaned_words = raw.split()
        # Clean up words, leave only all-alpha chars of length > 1 (classic for loop)
#         for w in uncleaned_words:
#             cw = clean_word(w)
#             if len(cw) > 1:# and len(cw) < 21:
#                 word_set2.add(cw)
        # Clean up words, leave only all-alpha chars of length > 1 (function programming)
        word_set |= set(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
print(len(word_set))
print(word_set[:20])
# print(len(word_set), len(word_set2))
# print(word_set2 - word_set)

15702 15747
{'eplchlorohydrinderlved', 'manufacturersassociation', 'benzothiazolesulfanamide', 'committeenotification', 'hydrocarbonphotochemical', 'uncertaintyvariability', 'philadelphiawilmingtonsouth', 'democratstorepublicans', 'oppositionparticularly', 'andfrhallmainfcmappopefrcwdeall', 'associationparticipated', 'mutagenicitycarcinoganlciey', 'significantdeterioration', 'confidentialinformation', 'theseerefcapytreasurer', 'dialkyldithiophosphates', 'compensationliability', 'addingsticivreporting', 'negotiationsagreements', 'leglslatlveregulatory', 'responsibilityhazardous', 'andtheseerefcapytreasureft', 'nickelcontainingcatalysts', 'transportationmaterials', 'mercaptobenzothlazole', 'congressionallymandated', 'legislationregulation', 'energyfeedstockeconomic', 'dibenzofuransdibenzodioxins', 'administrationsupported', 'photochemicallyreactive', 'industrygovemmentenvironment', 'communicationslabeling', 'lndustrialcommcrcialinstitutional', 'mutagenicitycarcinogenicity', 'dialkyldlthi

In [34]:
'''
# create a list of words from ground truth; disregard repeatition and order of words
word_list = []
gt_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'ground_truth', '*.txt'))

count = 0
for gt_f in gt_filenames:
    with open(gt_f) as file:
        raw = file.read()
        # Split file content into words (by '\n', '\t', ' ', etc.)
        uncleaned_words = raw.split()
        # Clean up words, leave only all-alpha chars of length > 1 (function programming)
        word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
print(len(word_list))
print(word_list[:20])
'''

277888
['media', 'training', 'was', 'provided', 'for', 'member', 'company', 'representatives', 'who', 'will', 'participate', 'in', 'upcoming', 'media', 'tours', 'on', 'the', 'national', 'chemical', 'response']


In [0]:
# Categorize ground truth words by their length
group_by_len = collections.defaultdict(list)
for w in word_set:
    group_by_len[len(w)].append(w)

# A dictionary of positional binary digrams (matrices),
# ordered by word length and then by binary positions
digrams_by_len = collections.defaultdict(dict)
for length in group_by_len:
    for i, j in itertools.combinations(range(length), 2):
        key = (i, j)
        matrix = [[0] * 26 for _ in range(26)]
        for w in group_by_len[length]:
            matrix[char_to_index(w[i])][char_to_index(w[j])] = 1
        digrams_by_len[length][key] = matrix
#     print(length, len(digrams_by_len[length].keys()))

In [11]:
# Create a list of words from tesseract text; regard repeatition and order of words
tr_word_list = []
tr_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'tesseract', '*.txt'))
for tr_f in tr_filenames:
    with open(tr_f) as file:
        raw = file.read()
        uncleaned_words = raw.split()
        tr_word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
        
print(tr_word_list[:30])

['medla', 'training', 'was', 'provlded', 'for', 'member', 'company', 'representatlves', 'who', 'will', 'partlclpate', 'upcomlng', 'medla', 'tours', 'on', 'the', 'natlonal', 'chemlcal', 'response', 'and', 'informatlon', 'center', 'energy', 'and', 'petrochemlcal', 'feedstock', 'regulatory', 'developments', 'and', 'response']


In [25]:
# A list of 3-tuples, each consisting of (detected error, left word, right word)
detected_error_tuples = []
for idx, w in enumerate(tr_word_list):
    error = False
    for i, j in itertools.combinations(range(len(w)), 2):
#         print(i, j, w[i], w[j], len(digrams_by_len[len(w)][(i, j)]), len(digrams_by_len[len(w)][(i, j)][0]))
        if not digrams_by_len[len(w)][(i, j)][char_to_index(w[i])][char_to_index(w[j])]:
            error = True
    if error:
        left = tr_word_list[i-1] if i > 0 else ''
        right = tr_word_list[i+1] if i < len(tr_word_list)-1 else ''
        detected_error_tuples.append((w, left, right))

# A list of detected error words
detected_error_words = [x[0] for x in detected_error_tuples]
        
print(len(detected_error_tuples), len(detected_error_words))        
print(detected_error_tuples[:10])
print(detected_error_words[:10])

50864 50864
[('representatlves', 'medla', 'on'), ('partlclpate', 'who', 'partlclpate'), ('upcomlng', 'member', 'representatlves'), ('natlonal', 'member', 'representatlves'), ('informatlon', 'who', 'partlclpate'), ('petrochemlcal', 'partlclpate', 'medla'), ('transportatlon', 'upcomlng', 'tours'), ('commlsslon', 'representatlves', 'will'), ('ferq', 'training', 'provlded'), ('nterstate', 'company', 'who')]
['representatlves', 'partlclpate', 'upcomlng', 'natlonal', 'informatlon', 'petrochemlcal', 'transportatlon', 'commlsslon', 'ferq', 'nterstate']


#Error Correction

In [13]:
def get_correction_candidates(w, word_set):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    corr_candidates = [[] for _ in range(4)]
    
    ### 4 kinds of correction candidates (see Table 2, C-4)
    # 0. Deletion
    for i in range(len(w) + 1):
        for c in alphabet:
            correction = w[:i] + c + w[i:]
            if correction in word_set:
                corr_candidates[0].append(correction)
            
    # 1. Insertion
    for i in range(len(w)):
        correction = w[:i] + w[i+1:]
        if correction in word_set:
            corr_candidates[1].append(correction)
            
    # 2. Reversal
    for i in range(len(w) - 1):
        correction = w[:i] + w[i+1] + w[i] + w[i+2:]
        if correction in word_set:
            corr_candidates[2].append(correction)
    
    # 3. Substitution
    for i in range(len(w)):
        for c in alphabet:
            if c != w[i]:
                correction = w[:i] + c + w[i+1:]
                if correction in word_set:
                    corr_candidates[3].append(correction)
    
    return corr_candidates
# cands = get_correction_candidates('voer', word_set)
# print(cands)

[[], ['ver'], ['over'], ['vour']]


In [0]:
[x[0] for x in detected_error_words]
for word in detected_error_words:

In [32]:
raw = 'aw. eg\new\twegw'
print(raw.split())


['aw.', 'eg', 'ew', 'wegw']
