In [0]:
# Run this cell only for Colab users
!unzip data.zip
# Otherwise, put the /data folder (containing two subfolders of 100 .txt files)
# under the working directory.
# Now we should have /data under the working directory.

In [0]:
# Upload correction_lib.py to Colab,
# or make sure it is in the same directory of this file.
# Now, import correction_lib module (in the /Correction directory). 
import correction_lib as corr

In [0]:
import string
import glob
import os
import itertools
import collections

In [0]:
def clean_word(w):
    out = []
    for c in w:
        c = c.lower()
        # Searching set is faster than list: O(1) vs. O(n=26)
        if c in set(string.ascii_lowercase):
            out.append(c)
    return ''.join(out)
# clean_word('Caat13.#abE')

In [0]:
def char_to_index(c):
    return ord(c) - ord('a')
# char_to_index('i')

In [0]:
# print matrices/digrams in a clear manner
def print_matrix(matrix):
    alphabet = ' ' + string.ascii_lowercase
    print('  '.join(alphabet))
    for i in range(len(matrix)):
        print(chr(ord('a')+i), matrix[i])
# print_digram(digrams_by_len[3][(0, 1)])

#Error Detection

In [7]:
# create a list of words from ground truth; include repeatition and order of words
word_list = []
gt_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'ground_truth', '*.txt'))

count = 0
for gt_f in gt_filenames:
    with open(gt_f) as file:
        raw = file.read()
        # Split file content into words (by '\n', '\t', ' ', etc.)
        uncleaned_words = raw.split()
        # Clean up words, leave only all-alpha chars of length > 1 (function programming)
        word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
print(len(word_list))
print(word_list[:20])

word_set = set(word_list)
print(len(word_set))
print(list(word_set)[:20])

277888
['wa', 'reasonable', 'although', 'it', 'substantially', 'increased', 'the', 'programs', 'and', 'authority', 'over', 'preset', 'law', 'following', 'is', 'brief', 'list', 'of', 'key', 'issues']
15702
['thd', 'driven', 'inatituta', 'ezpected', 'petro', 'giant', 'finish', 'lest', 'direct', 'overviews', 'charleston', 'istr', 'timothy', 'korea', 'face', 'sales', 'finger', 'anhydride', 'aaend', 'devastating']


In [0]:
# Categorize ground truth words by their length
group_by_len = collections.defaultdict(list)
for w in word_set:
    group_by_len[len(w)].append(w)

# A dictionary of positional binary digrams (matrices),
# ordered by word length and then by binary positions
digrams_by_len = collections.defaultdict(dict)
for length in group_by_len:
    for i, j in itertools.combinations(range(length), 2):
        key = (i, j)
        matrix = [[0] * 26 for _ in range(26)]
        for w in group_by_len[length]:
            matrix[char_to_index(w[i])][char_to_index(w[j])] = 1
        digrams_by_len[length][key] = matrix
#     print(length, len(digrams_by_len[length].keys()))

In [10]:
# Create a list of words from tesseract text; regard repeatition and order of words
tr_word_list = []
tr_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'tesseract', '*.txt'))
for tr_f in tr_filenames:
    with open(tr_f) as file:
        raw = file.read()
        uncleaned_words = raw.split()
        tr_word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
print(len(tr_word_list), '\n', tr_word_list[:30])

266753 
 ['ha', 'reas', 'able', 'although', 'substantlally', 'ncreased', 'the', 'programs', 'and', 'authorlty', 'over', 'preset', 'followl', 'brlef', 'of', 'key', 'lssues', 'that', 'were', 'consldered', 'and', 'the', 'votes', 'whlch', 'mior', 'and', 'damaglng', 'changes', 'were', 'rejected']


In [11]:
# A list of 3-tuples, each consisting of (detected error, left word, right word)
detected_error_tuples = []
for idx, w in enumerate(tr_word_list):
    error = False
    for i, j in itertools.combinations(range(len(w)), 2):
#         print(i, j, w[i], w[j], len(digrams_by_len[len(w)][(i, j)]), len(digrams_by_len[len(w)][(i, j)][0]))
        if not digrams_by_len[len(w)][(i, j)][char_to_index(w[i])][char_to_index(w[j])]:
            error = True
    if error:
        left = tr_word_list[i-1] if i > 0 else ''
        right = tr_word_list[i+1] if i < len(tr_word_list)-1 else ''
        detected_error_tuples.append((w, left, right))

# A list of detected error words
detected_error_words = [x[0] for x in detected_error_tuples]
        
print(len(detected_error_tuples), len(detected_error_words))        
print(detected_error_tuples[:10])
print(detected_error_words[:10])

50864 50864
[('substantlally', 'over', 'followl'), ('ncreased', 'ncreased', 'programs'), ('followl', 'substantlally', 'the'), ('brlef', 'able', 'substantlally'), ('consldered', 'programs', 'authorlty'), ('damaglng', 'ncreased', 'programs'), ('mandato', 'substantlally', 'the'), ('mlght', 'able', 'substantlally'), ('envlronment', 'and', 'over'), ('requlre', 'substantlally', 'the')]
['substantlally', 'ncreased', 'followl', 'brlef', 'consldered', 'damaglng', 'mandato', 'mlght', 'envlronment', 'requlre']


#Error Correction

In [0]:
def get_correction_candidates(w, word_set):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    candidate_list = [[] for _ in range(4)]
    
    ### 4 kinds of correction candidates (see Table 2, C-4)
    # 0. Deletion
    for i in range(len(w) + 1):
        for c in alphabet:
            correction = w[:i] + c + w[i:]
            if correction in word_set:
                candidate_list[0].append(correction)
            
    # 1. Insertion
    for i in range(len(w)):
        correction = w[:i] + w[i+1:]
        if correction in word_set:
            candidate_list[1].append(correction)
    
    # 2. Substitution
    for i in range(len(w)):
        for c in alphabet:
            if c != w[i]:
                correction = w[:i] + c + w[i+1:]
                if correction in word_set:
                    candidate_list[2].append(correction)
            
    # 3. Reversal
    for i in range(len(w) - 1):
        correction = w[:i] + w[i+1] + w[i] + w[i+2:]
        if correction in word_set:
            candidate_list[3].append(correction)
    
    # Add letter differences to each entry of the list returned
    if candidate_list[0]:
        for j, w in enumerate(v):
            letters = corr.find_insertion_letters(w, word)
            candidate_list[0][j] = (w, letters['pre_letter'], letters['delete_letter'])

    if candidate_list[1]:
        for j, w in enumerate(v):
            letters = corr.find_insertion_letters(w, word)
            candidate_list[1][j] = (w, letters['pre_letter'], letters['insert_letter'])

    if candidate_list[2]:
        for j, w in enumerate(v):
            letters = corr.find_sub_rev_letters(w, word)
            if letters['tag'] == 'sub':
                candidate_list[2][j] = (w, letters['pre_letter'], letters['changed_letter'])

    if candidate_list[3]:
        for j, w in enumerate(v):
            letters = corr.find_sub_rev_letters(w, word)
            if letters['tag'] == 'rev':
                candidate_list[3][j] = (w, letters['pre_letter'], letters['changed_letter'])
    
    return candidate_list
# cands = get_correction_candidates('voer', word_set)
# print(cands)

In [52]:
# Codes for computing Pr(t|c)
word = 'representatlves'
candidate_list = get_correction_candidates(word, word_set)
print(candidate_list)

all_candidates = []
for word in detected_error_words:
    all_candidates.append(get_correction_candidates(word, word_set))
    
print()
    


[[], [('representatives', '', '')], [('representatives', 'i', 'l')], []]


In [0]:
# Compute Pr(c), estimated by ELE (expected LE)
def get_Pr_c(correction):
    return corr_probs[correction]

In [0]:
# Create confusion matrices
confusion_matrices = corr.Create_Confusion_Matrix()
# print_matrix(Confusion["Deletion_Confusion"])
# print_matrix(Confusion["Insertion_Confusion"])
# print_matrix(Confusion["Substitution_Confusion"])
# print_matrix(Confusion["Reversal_Confusion"])

In [0]:
# Useful values according to section 3 of paper C-4.
N = len(word_list)
V = len(word_set)
denominator = N + V/2

# A dictionary of frequecies of words in the ground truth
word_freqs = collections.defaultdict(int)
for word in word_list:
    word_freqs[word] += 1
# print(dict((k, v) for k, v in word_freqs.items() if v >= 2))

# Pr(c) of all possible corrections (all words from ground truth)
corr_probs = collections.defaultdict(float)
for word, freq in word_freqs.items():
    corr_probs[word] = (freq + 0.5)/denominator
print(corr_probs)





In [28]:
d = (1,2,3)
d[0]

1

In [0]:
# Compute Pr(t|c)
def get_Pr_tc(typo, correction, type):
    
    
    
    prob = 0
    return prob

In [0]:
# chars[x] and chars[xy]
chars_x = [0] * 26
chars_xy = [[0] * 26 for _ in range(26)]
for word in word_list:
    for i, c in enumerate(word):
        chars_x[char_to_index(c)] += 1
        if not i:
            chars_xy[char_to_index(word[i-1])][char_to_index(c)] += 1
# print(chars_x)
# print_matrix(chars_xy)

[124021, 20526, 65591, 57785, 188486, 32193, 28724, 54776, 120235, 2431, 5646, 59135, 48238, 115142, 118400, 41345, 1775, 100836, 103090, 143693, 42097, 17390, 18163, 4932, 20936, 2072]
   a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z
a [378, 27, 3341, 366, 699, 173, 25, 31, 76, 15, 12, 80, 566, 53, 252, 108, 10, 158, 140, 484, 51, 49, 20, 0, 1, 1]
b [8, 15, 23, 4, 10, 0, 0, 8, 1, 29, 0, 2, 12, 9, 11, 31, 0, 0, 33, 17, 2, 1, 0, 0, 0, 0]
c [124, 54, 326, 196, 419, 65, 60, 64, 230, 2, 1, 0, 36, 90, 72, 581, 4, 36, 394, 225, 19, 20, 5, 3, 0, 22]
d [10584, 865, 1680, 1026, 992, 709, 244, 729, 1158, 34, 23, 318, 316, 378, 340, 1709, 30, 1813, 2150, 443, 508, 123, 955, 2, 17, 0]
e [2923, 2966, 3987, 1072, 1467, 862, 287, 2033, 1608, 200, 46, 851, 1359, 293, 857, 1961, 50, 1853, 3226, 21768, 521, 358, 2397, 1, 3, 7]
f [0, 98, 25, 4, 9, 4, 9, 30, 297, 2, 1, 1, 4, 18, 10233, 19, 2, 29, 376, 74, 1, 2, 0, 0, 0, 0]
g [494, 595, 710, 799, 525, 481, 142, 225, 566, 5, 2

In [0]:
# Compute Pr(l|c)
def get_Pr_lc(left, correction):
    
    
    prob = 0
    return prob

In [0]:
# Compute Pr(r|c)
def get_Pr_rc(right, correction):
    
    
    prob = 0
    return prob