In [0]:
# Run this cell only for Colab users
!unzip data.zip
# Otherwise, put the /data folder (containing two subfolders of 100 .txt files)
# under the working directory.
# Now we should have /data under the working directory.

In [0]:
# Upload correction_lib.py and Create_Words_Dictionary.py to Colab,
# or make sure they are in the same directory of this file (for local machine).
# Now, import correction_lib and Create_Words_Dictionary modules. 
import correction_lib as corr
import Create_Words_Dictionary as create_dict

In [0]:
import string
import glob
import os
import itertools
import collections
import timeit

In [0]:
def clean_word(w):
    out = []
    for c in w:
        c = c.lower()
        # Searching set is faster than list: O(1) vs. O(n=26)
        if c in set(string.ascii_lowercase):
            out.append(c)
    return(''.join(out))
# clean_word('Caat13.#abE')

In [0]:
def char_to_index(c):
    return(ord(c) - ord('a'))
# char_to_index('i')

In [0]:
# print matrices/digrams in a clear manner
def print_matrix(matrix):
    alphabet = ' ' + string.ascii_lowercase
    print('  '.join(alphabet))
    for i in range(len(matrix)):
        print(chr(ord('a')+i), matrix[i])
# print_digram(digrams_by_len[3][(0, 1)])

#Error Detection

In [7]:
# create a list of words from ground truth; include repeatition and order of words
word_list = []
gt_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'ground_truth', '*.txt'))

count = 0
for gt_f in gt_filenames:
    with open(gt_f) as file:
        raw = file.read()
        # Split file content into words (by '\n', '\t', ' ', etc.)
        uncleaned_words = raw.split()
        # Clean up words, leave only all-alpha chars of length > 1 (function programming)
        word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
print(len(word_list))
print(word_list[:20])

word_set = set(word_list)
print(len(word_set))
print(list(word_set)[:20])

277888
['terminals', 'for', 'use', 'with', 'aluminum', 'andor', 'copper', 'conductors', 'cma', 'objected', 'to', 'proposal', 'that', 'this', 'standard', 'be', 'recognized', 'as', 'an', 'american']
15702
['labeling', 'cort', 'technically', 'waterhouse', 'traumatic', 'initiates', 'gulations', 'present', 'ola', 'adoption', 'misunderstandings', 'hiring', 'switching', 'registry', 'concentrates', 'properlyimage', 'moving', 'counters', 'sri', 'antiestablishment']


In [0]:
# Categorize ground truth words by their length
group_by_len = collections.defaultdict(list)
for w in word_set:
    group_by_len[len(w)].append(w)

# A dictionary of positional binary digrams (matrices),
# ordered by word length and then by binary positions
digrams_by_len = collections.defaultdict(dict)
for length in group_by_len:
    for i, j in itertools.combinations(range(length), 2):
        key = (i, j)
        matrix = [[0] * 26 for _ in range(26)]
        for w in group_by_len[length]:
            matrix[char_to_index(w[i])][char_to_index(w[j])] = 1
        digrams_by_len[length][key] = matrix
#     print(length, len(digrams_by_len[length].keys()))

In [9]:
# Create a list of words from tesseract text; regard repeatition and order of words
tr_word_list = []
tr_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'tesseract', '*.txt'))
for tr_f in tr_filenames:
    with open(tr_f) as file:
        raw = file.read()
        uncleaned_words = raw.split()
        tr_word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
print(len(tr_word_list), '\n', tr_word_list[:30])

266753 
 ['emlnals', 'for', 'use', 'hlth', 'alumlnum', 'andor', 'copper', 'conductors', 'cm', 'objected', 'to', 'that', 'thls', 'standard', 'be', 'recognlzed', 'as', 'an', 'amerlcan', 'natlonal', 'standar', 'gauss', 'our', 'new', 'the', 'test', 'procedure', 'should', 'be', 'further']


In [10]:
# A list of 3-tuples, each consisting of (detected error, left word, right word)
detected_error_tuples = []
for idx, w in enumerate(tr_word_list):
    error = False
    for i, j in itertools.combinations(range(len(w)), 2):
#         print(i, j, w[i], w[j], len(digrams_by_len[len(w)][(i, j)]), len(digrams_by_len[len(w)][(i, j)][0]))
        if not digrams_by_len[len(w)][(i, j)][char_to_index(w[i])][char_to_index(w[j])]:
            error = True
    if error:
        left = tr_word_list[i-1] if i > 0 else ''
        right = tr_word_list[i+1] if i < len(tr_word_list)-1 else ''
        detected_error_tuples.append((w, left, right))

# A list of detected error words
detected_error_words = [x[0] for x in detected_error_tuples]
        
print(len(detected_error_tuples), len(detected_error_words))        
print(detected_error_tuples[:10])
print(detected_error_words[:10])

50864 50864
[('emlnals', 'alumlnum', 'copper'), ('hlth', 'for', 'hlth'), ('alumlnum', 'andor', 'conductors'), ('recognlzed', 'conductors', 'objected'), ('natlonal', 'andor', 'conductors'), ('gauss', 'use', 'alumlnum'), ('testlng', 'alumlnum', 'copper'), ('eleld', 'use', 'alumlnum'), ('condltlons', 'conductors', 'objected'), ('leglslatlon', 'cm', 'to')]
['emlnals', 'hlth', 'alumlnum', 'recognlzed', 'natlonal', 'gauss', 'testlng', 'eleld', 'condltlons', 'leglslatlon']


#Error Correction

In [0]:
# Returns a list of (correction candidates, changed letters),
# ordered by types of correction,
# given a word and a ground truth word set
def get_correction_candidates(w, word_set):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    candidate_list = [[] for _ in range(4)]
    
    ### 4 kinds of correction candidates (see Table 2, C-4)
    # 0. Deletion
    for i in range(len(w) + 1):
        for c in alphabet:
            correction = w[:i] + c + w[i:]
            if correction in word_set:
                candidate_list[0].append(correction)
            
    # 1. Insertion
    for i in range(len(w)):
        correction = w[:i] + w[i+1:]
        if correction in word_set:
            candidate_list[1].append(correction)
    
    # 2. Substitution
    for i in range(len(w)):
        for c in alphabet:
            if c != w[i]:
                correction = w[:i] + c + w[i+1:]
                if correction in word_set:
                    candidate_list[2].append(correction)
            
    # 3. Reversal
    for i in range(len(w) - 1):
        correction = w[:i] + w[i+1] + w[i] + w[i+2:]
        if correction in word_set:
            candidate_list[3].append(correction)
    
    # Add letter differences to each entry of the list returned
    if candidate_list[0]:
        for j, w in enumerate(candidate_list[0]):
            letters = corr.find_deletion_letters(w, word)
            candidate_list[0][j] = (w, letters['pre_letter'], letters['delete_letter'])

    if candidate_list[1]:
        for j, w in enumerate(candidate_list[1]):
            letters = corr.find_insertion_letters(w, word)
            candidate_list[1][j] = (w, letters['pre_letter'], letters['insert_letter'])

    if candidate_list[2]:
        for j, w in enumerate(candidate_list[2]):
            letters = corr.find_sub_rev_letters(w, word)
            if letters['tag'] == 'sub':
                candidate_list[2][j] = (w, letters['pre_letter'], letters['changed_letter'])

    if candidate_list[3]:
        for j, w in enumerate(candidate_list[3]):
            letters = corr.find_sub_rev_letters(w, word)
            if letters['tag'] == 'rev':
                candidate_list[3][j] = (w, letters['pre_letter'], letters['changed_letter'])
    
#     print(candidate_list)
    return(candidate_list)

In [12]:
# Create a dictionary of all candidates of each detected error

start = timeit.default_timer()

all_candidates = collections.defaultdict(dict)
for word in detected_error_words:
    all_candidates[word] = get_correction_candidates(word, word_set)

stop = timeit.default_timer()

print('Time:', stop - start, 'seconds')
print(len(detected_error_words))
print(len(all_candidates))
print(all_candidates)

Time: 10.844152521000069 seconds
50864
10852


In [0]:
# Compute Pr(c), estimated by ELE (expected LE)
def get_Pr_c(correction):
    return(corr_probs[correction])

In [0]:
# Create confusion matrices
confusion_matrices = corr.Create_Confusion_Matrix()
# print_matrix(Confusion["Deletion_Confusion"])
# print_matrix(Confusion["Insertion_Confusion"])
# print_matrix(Confusion["Substitution_Confusion"])
# print_matrix(Confusion["Reversal_Confusion"])

In [15]:
# Useful values according to section 3 of paper C-4.
N = len(word_list)
V = len(word_set)
denominator = N + V/2

# A dictionary of frequecies of words in the ground truth
word_freqs = collections.defaultdict(int)
for word in word_list:
    word_freqs[word] += 1
# print(dict((k, v) for k, v in word_freqs.items() if v >= 2))

# Pr(c) of all possible corrections (all words from ground truth)
corr_probs = collections.defaultdict(float)
for word, freq in word_freqs.items():
    corr_probs[word] = (freq + 0.5)/denominator
print(corr_probs)



In [43]:
# A dictionary, the key of which is the words in ground truth,
# the value to each key records the two neighbor words and their frequencies.
neighbor_dict = create_dict.Create_Words_Dictionary()
neighbor_dict['reliable']
# print(dict((k, v) for k, v in Words_Dictionary['increased'].items if len(v) >= 2))

{'left': {'and': 4,
  'are': 1,
  'from': 1,
  'more': 1,
  'no': 1,
  'patents': 1,
  'provide': 1,
  'that': 1},
 'right': {'and': 1,
  'assets': 1,
  'basis': 1,
  'data': 2,
  'human': 1,
  'long': 1,
  'national': 1,
  'service': 1,
  'sources': 1,
  'we': 1}}

In [51]:
cand = 'reliable'
left = 'and'
right = 'dada'
method = 'ELE'
# r = 0
# if neighbor in neighbor_dict[word][position]:
#     r = neighbor_dic[word][left_word]['left'] * neighbor_dic[word][left_word]['right']
# if method == 'MLE':
#     return(r)
# else: # method == 'ELE'
#     return(r + 0.5)

def get_Pr_context_correction(cand, left, right, method):
    """
    word:   a correction candidate
    left:   the left neighbor of the error word in tesseract
    right:  the right neighbor of the error word in tesseract
    method: 'MLE', or 'ELE' where r = freq + 0.5
    return: ELE of Pr(l|c) * Pr(r|c)
    """
    # r_left = freq of left appearing, r_right = freq of right appearing
    r_left, r_right = 0, 0
    if left in neighbor_dict[cand]['left']:
        r_left = neighbor_dict[cand]['left'][left]
    if right in neighbor_dict[cand]['right']:
        r_right = neighbor_dict[cand]['right'][right]
        
    if method == 'MLE':
        return(r_left*r_right)
    else: # method == 'ELE'
        return((r_left + 0.5)*(r_right + 0.5))

print(get_Pr_context_correction(cand, left, right, method))
# print(get_Pr_context_c(word, neighbor, 'left', 'ELE'))

2.25


In [0]:
def get_Pr_final_correction(cand, neighbor, position, method):
    return 0



In [0]:
# chars[x] and chars[xy]
chars_x = [0] * 26
chars_xy = [[0] * 26 for _ in range(26)]
for word in word_list:
    for i, c in enumerate(word):
        chars_x[char_to_index(c)] += 1
        if not i:
            chars_xy[char_to_index(word[i-1])][char_to_index(c)] += 1
# print(chars_x)
# print_matrix(chars_xy)

In [40]:
# word = 'reliable'
# neighbor = 'that'
    
# def get_r(word, neighbor, position):
#     """
#     word:     a word from ground truth
#     neighbor: the word to the left/right of word
#     position: 'left' or 'right'
    
#     return: r
#     """
#     if neighbor in Words_Dictionary[word][position]:
#         return Words_Dictionary[word][position][neighbor]
#     return 0

# def get_Nr(r, word, position):
#     """
#     r:        the value of r given by get_r() 
#     word:     a word from ground truth
#     position: 'left' or 'right'
    
#     return: r
#     """
#     # If r == 0, we compute Nr = N_0 = V - sum of Nr for all r, where V == len(word_set),
#     # i.e. # left words that never appears
#     if not r:
#         return V - sum(Words_Dictionary[word][position].values())
#     # If r!= 0, we compute Nr = # left words that have frequency r
#     Nr = 0
#     for lword in Words_Dictionary[word][position]:
#         if Words_Dictionary[word][position][lword] == r:
#             Nr += 1
#     return Nr
# # print(get_Nr(0, word, 'right'))

# def get_r_star(r, word, position):
#     print((r + 1), get_Nr(r + 1, word, position), get_Nr(r, word, position))
#     return (r + 1)*get_Nr(r + 1, word, position) / get_Nr(r, word, position)

# print(get_r_star(2, word, 'right'))


# # print(get_r(word, neighbor, 'left'))
# # r_star = (r + 1)
a = collections.defaultdict(int)
a[1234]

0