In [20]:
import correction_lib as corr
from word_dict import Create_Words_Dictionary, Create_Word_Pair
import string
import glob
import os
import itertools
import collections
import timeit
import random
import pandas as pd
import numpy as np

In [21]:
def clean_word(w):
    out = []
    for c in w:
        c = c.lower()
        # Searching set is faster than list: O(1) vs. O(n=26)
        if c in set(string.ascii_lowercase):
            out.append(c)
    return(''.join(out))

def char_to_index(c):
    if c=="{":
        return(26)
    else:
        return(ord(c) - ord('a'))

In [23]:
word_list = []
gt_filenames = glob.glob(os.path.join(os.getcwd(), '../data', 'ground_truth', '*.txt'))

count = 0
for gt_f in gt_filenames:
    with open(gt_f) as file:
        raw = file.read()
        # Split file content into words (by '\n', '\t', ' ', etc.)
        uncleaned_words = raw.split()
        # Clean up words, leave only all-alpha chars of length > 1 (function programming)
        word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
        
word_set=set(word_list)

In [25]:
group_by_len = collections.defaultdict(list)
for w in word_set:
    group_by_len[len(w)].append(w)

# A dictionary of positional binary digrams (matrices),
# ordered by word length and then by binary positions
digrams_by_len = collections.defaultdict(dict)
for length in group_by_len:
    for i, j in itertools.combinations(range(length), 2):
        key = (i, j)
        matrix = [[0] * 26 for _ in range(26)]
        for w in group_by_len[length]:
            matrix[char_to_index(w[i])][char_to_index(w[j])] = 1
        digrams_by_len[length][key] = matrix

In [26]:
#####need to.be change
# Create a list of words from tesseract text; regard repeatition and order of words
tr_word_list = []
tr_filenames = glob.glob(os.path.join(os.getcwd(), 'data', 'tesseract', '*.txt'))
for tr_f in tr_filenames:
    with open(tr_f) as file:
        raw = file.read()
        uncleaned_words = raw.split()
        tr_word_list += list(filter(lambda x: 1 < len(x) < 21, map(clean_word, uncleaned_words)))
N_tr = len(tr_word_list)
# print(N_tr, '\n', tr_word_list[:30])

In [27]:
detected_error_tuples = []
for idx, w in enumerate(tr_word_list):
    error = False
    for i, j in itertools.combinations(range(len(w)), 2):
#         print(i, j, w[i], w[j], len(digrams_by_len[len(w)][(i, j)]), len(digrams_by_len[len(w)][(i, j)][0]))
        if not digrams_by_len[len(w)][(i, j)][char_to_index(w[i])][char_to_index(w[j])]:
            error = True
    if error:
        left = tr_word_list[i-1] if i > 0 else ''
        right = tr_word_list[i+1] if i < len(tr_word_list)-1 else ''
        detected_error_tuples.append((w, left, right))

# A list of detected error words
detected_error_words = [x[0] for x in detected_error_tuples]

In [28]:
# read Confusion matrix
add_matrix=np.array(pd.read_csv("../data/confusion_matrix/add_matrix.csv",index_col=0))
del_matrix=np.array(pd.read_csv("../data/confusion_matrix/del_matrix.csv",index_col=0))
rev_matrix=np.array(pd.read_csv("../data/confusion_matrix/rev_matrix.csv",index_col=0))
sub_matrix=np.array(pd.read_csv("../data/confusion_matrix/sub_matrix.csv",index_col=0))

# Useful values from section 3 of paper C-4.
N = len(word_list)
V = len(word_set)
denominator = N + V/2

In [29]:
def get_correction_candidates(w):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    candidate_list = [[] for _ in range(4)]
    
    ### 4 kinds of correction candidates (see Table 2, C-4)
    # 0. Deletion
    for i in range(len(w) + 1):
        for c in alphabet:
            correction = w[:i] + c + w[i:]
            if correction in word_set:
                diff = corr.find_deletion_letters(correction, w) 
                candidate_list[0].append((correction, diff['pre_letter'], diff['delete_letter']))
            
    # 1. Insertion
    for i in range(len(w)):
        correction = w[:i] + w[i+1:]
        if correction in word_set:
            diff = corr.find_insertion_letters(correction, w) 
            candidate_list[1].append((correction, diff['pre_letter'], diff['insert_letter']))
    
    # 2. Substitution
    for i in range(len(w)):
        for c in alphabet:
            if c != w[i]:
                correction = w[:i] + c + w[i+1:]
                if correction in word_set:
                    diff = corr.find_sub_rev_letters(correction, w) 
                    if diff['tag'] == 'sub':
                        candidate_list[2].append((correction, diff['pre_letter'], diff['changed_letter']))
    
    # 3. Reversal
    for i in range(len(w) - 1):
        correction = w[:i] + w[i+1] + w[i] + w[i+2:]
        if correction in word_set:
            diff = corr.find_sub_rev_letters(correction, w) 
            if diff['tag'] == 'rev':
                candidate_list[3].append((correction, diff['pre_letter'], diff['changed_letter']))
    
    return(candidate_list)

In [31]:
get_correction_candidates('responder')

[[('responders', 'r', 's')], [], [('responded', 'd', 'r')], []]

In [32]:
#read detected words from detection
import re
detection=pd.read_excel('detection.xlsx')
detected_error_words=[str(i) for i in np.array(detection['tokens_tesseract'])]
detected_error_words=[''.join(re.findall('[a-z]+',i)) for i in detected_error_words]
# detection.info()

In [33]:
start = timeit.default_timer()

all_candidates = collections.defaultdict(list)
for word in detected_error_words:
    all_candidates[word] = get_correction_candidates(word)
    
stop = timeit.default_timer()
print('Time:', stop - start, 'seconds')

Time: 1.755324236999968 seconds


In [34]:
# all_candidates = collections.defaultdict(list)
temp=['admlnlstratlon','responder','responder']
# for word in temp:
#     all_candidates[word] = get_correction_candidates(word)
# all_candidates
print(detected_error_words[0:10])
print(temp)

['admlnlstratlon', 'exemptlons', 'ommlttee', 'closediloop', 'n', 'whlch', 'responder', 'authorlty', 'sso', 'olunteers']
['admlnlstratlon', 'responder', 'responder']


In [35]:
all_candidates

defaultdict(list,
            {'admlnlstratlon': [[], [], [], []],
             'exemptlons': [[], [], [('exemptions', 'i', 'l')], []],
             'ommlttee': [[('commlttee', '{', 'c')], [], [], []],
             'closediloop': [[], [('closedloop', 'd', 'i')], [], []],
             'n': [[('an', '{', 'a'),
               ('bn', '{', 'b'),
               ('cn', '{', 'c'),
               ('en', '{', 'e'),
               ('in', '{', 'i'),
               ('ln', '{', 'l'),
               ('mn', '{', 'm'),
               ('nn', 'n', 'n'),
               ('on', '{', 'o'),
               ('tn', '{', 't'),
               ('un', '{', 'u'),
               ('wn', '{', 'w'),
               ('xn', '{', 'x'),
               ('zn', '{', 'z'),
               ('nc', 'n', 'c'),
               ('nd', 'n', 'd'),
               ('ne', 'n', 'e'),
               ('ng', 'n', 'g'),
               ('nh', 'n', 'h'),
               ('nj', 'n', 'j'),
               ('nl', 'n', 'l'),
               ('nn', 'n', 'n'

In [36]:
word_freqs = collections.defaultdict(int)
for word in word_list:
    word_freqs[word] += 1
# print(dict((k, v) for k, v in word_freqs.items() if v >= 2))

# Pr(c) of all possible corrections (all words from ground truth)
word_freqs_freqs = {}
for freqs in word_freqs.values():
    if freqs in word_freqs_freqs.keys():
        word_freqs_freqs[freqs] += 1
    else:
        word_freqs_freqs[freqs] = 1
word_freqsN = {}
for k in word_freqs_freqs.keys():
    if k + 1  in word_freqs_freqs.keys():
        word_freqsN[k+1] = word_freqs_freqs[k+1]
    else:
        word_freqsN[k+1] = 0.5
N1 = 0
for j in word_freqsN.keys():
    N1 = N1 + j*word_freqsN[j]
word_freqs_freqsfinal = {}
for k in word_freqs_freqs.keys():
    word_freqs_freqsfinal[k] = word_freqs_freqs[k]
    if k + 1  in word_freqs_freqs.keys():
        word_freqs_freqsfinal[k+1] = word_freqs_freqs[k+1]
    else:
        word_freqs_freqsfinal[k+1] = 0.5

corr_probs_mle = collections.defaultdict(float)
for word, freq in word_freqs.items():
    corr_probs_mle[word] =freq/N
corr_probs_ele = collections.defaultdict(float)
for word, freq in word_freqs.items():
    corr_probs_ele[word] = (freq + 0.5)/denominator
corr_probs_gt = collections.defaultdict(float)
for word, freq in word_freqs.items():
    corr_probs_gt[word] = (freq + 1)*word_freqs_freqsfinal[freqs + 1]/word_freqs_freqsfinal[freqs]/N1

In [37]:
def get_Pr_c(correction,method):
    if correction:
        if method == "MLe":
            return(corr_probs_mle[correction])
        elif method == "ELE":
            return(corr_probs_ele[correction])
        else:
            return(corr_probs_gt[correction])
    else:
        return 0

In [38]:
# chars[x] and chars[xy]
# word_list=['abcdefg','awses']
chars_x = np.zeros(26)
chars_xy = np.zeros([27,26])
print(chars_xy.shape)
for word in word_list:
    for i, c in enumerate(word):
        chars_x[char_to_index(c)] += 1
        if i==0:
            chars_xy[26][char_to_index(c)] += 1
        else:
            chars_xy[char_to_index(word[i-1])][char_to_index(c)] += 1
for i in range(len(chars_xy)):
    for j in range(len(chars_xy[0])):
        if not chars_xy[i][j]:
            chars_xy[i][j] = 0.5

(27, 26)


In [39]:
def get_Pr_tc(pre, cur, error_type):
    if error_type == 'del':
        return del_matrix[char_to_index(pre)][char_to_index(cur)] \
               / chars_xy[char_to_index(pre)][char_to_index(cur)]
    if error_type == 'ins':
        return ins_matrix[char_to_index(pre)][char_to_index(cur)] \
               / chars_x[char_to_index(pre)]
    if error_type == 'sub':
        return sub_matrix[char_to_index(pre)][char_to_index(cur)] \
               / chars_x[char_to_index(pre)]
    if error_type == 'rev':
        return rev_matrix[char_to_index(pre)][char_to_index(cur)] \
               / chars_xy[char_to_index(pre)][char_to_index(cur)]

In [40]:
def get_Pr_context_correction(cand, left, right):
    """
   
    """
    # r_left = freq of left appearing, r_right = freq of right appearing
    r_left, r_right = 0, 0
    if cand:
        if left in neighbor_dict[cand]['left']:
            r_left = neighbor_dict[cand]['left'][left]
        if right in neighbor_dict[cand]['right']:
            r_right = neighbor_dict[cand]['right'][right]
        return(r_left*r_right)/N^2
    else:
        return 0
    

In [41]:
result = Create_Word_Pair()
word_pairs = result[:-1]
num_corrections_found_tesseract, sum_letters = result[-1][0], result[-1][1]

In [42]:
start = timeit.default_timer()
detected_error_tuples2 = []

for t in detected_error_tuples:
    for pair in word_pairs:
        if t[0] == pair[1]:
            detected_error_tuples2.append((t[0], pair[0], t[1], t[2]))
            break
stop = timeit.default_timer() 
print('Time:', stop - start, 'seconds')

Time: 0.000114714999995158 seconds


In [23]:
def get_final_Pr_correction(cand, typo, pre,cur,error_type, left, right, method):
    return get_Pr_c(cand,method) * get_Pr_tc(pre,cur,error_type) \
           * get_Pr_context_correction(cand, left, right)

err_type = [ 'ins','del', 'sub', 'rev']
# A dictionary of the form: typo:(ground_truth, MLE_correction, ELE_correction)
result = {}
for typo, truth, left, right in detected_error_tuples2:
    cand_poss_mle, cand_poss_ele,cand_poss_gt = [], [], []
    for e in range(4):
        for cand,pre,cur in all_candidates[typo][e]:
            mle_p = get_final_Pr_correction(cand, typo,pre,cur, err_type[e], left, right, 'MLE')
            ele_p = get_final_Pr_correction(cand, typo,pre,cur, err_type[e], left, right, 'ELE')
            gt_p = get_final_Pr_correction(cand, typo, pre,cur,err_type[e], left, right, 'GT') 
            cand_poss_mle.append((cand, mle_p))
            cand_poss_ele.append((cand, ele_p))
            cand_poss_gt.append((cand, gte_p))
    mle_best_cand = max(cand_poss_mle, key=lambda x: x[1])[0] if cand_poss_mle else ''
    ele_best_cand = max(cand_poss_ele, key=lambda x: x[1])[0] if cand_poss_ele else ''
    gt_best_cand = max(cand_poss_gt, key=lambda x: x[1])[0] if cand_poss_gt else ''
    result[typo] = (truth, mle_best_cand, ele_best_cand,gt_best_cand)



IndexError: list index out of range

In [87]:
detected_error_tuples2


[('provlde', 'provide', 'are', 'encouraged'),
 ('ncludng', 'certain', 'are', 'encouraged'),
 ('heavlly', 'heavily', 'are', 'encouraged'),
 ('nvolved', 'involved', 'are', 'encouraged'),
 ('crltlcal', 'critical', 'strongly', 'to'),
 ('natlonal', 'national', 'strongly', 'to'),
 ('leglslators', 'legislators', 'provlde', 'needed'),
 ('dlsposal', 'disposal', 'strongly', 'to'),
 ('contlnue', 'continue', 'strongly', 'to'),
 ('toxlc', 'toxic', 'member', 'are'),
 ('preventlon', 'prevention', 'to', 'thls'),
 ('requlres', 'requires', 'strongly', 'to'),
 ('reportlng', 'reporting', 'encouraged', 'provlde'),
 ('nformatlon', 'information', 'to', 'thls'),
 ('concernlng', 'concerning', 'to', 'thls'),
 ('ndustrys', 'industrys', 'strongly', 'to'),
 ('rlsk', 'risk', 'network', 'companles'),
 ('reductlon', 'reduction', 'encouraged', 'provlde'),
 ('cessatlon', 'cessation', 'encouraged', 'provlde'),
 ('toxlc', 'toxic', 'member', 'are'),
 ('unportant', 'important', 'encouraged', 'provlde'),
 ('lssue', 'issue',

In [109]:
 all_candidates


defaultdict(dict,
            {'admlnlstratlon': [[], [], [], []],
             'exemptlons': [[], [], [('exemptions', 'i', 'l')], []],
             'Commlttee': [[], [], [], []],
             '"closediloop"': [[], [], [], []],
             '1n': [[], [], [], []],
             'whlch': [[],
              [('wlch', 'w', 'h')],
              [('which', 'i', 'l'), ('whleh', 'e', 'c')],
              []],
             'responder.': [[], [('responder', '', '')], [], []],
             'authorlty': [[], [], [('authority', 'i', 'l')], []],
             'MIssIoN': [[], [], [], []],
             'Volunteers': [[], [], [], []],
             '1ndustry': [[], [('ndustry', '', '')], [], []],
             '9a': [[], [], [], []],
             'presented': [[], [], [('prevented', 'v', 's')], []],
             'slmllar': [[], [], [], []],
             "Assoclatlon's": [[], [], [], []],
             'slgnature': [[], [], [('signature', 'i', 'l')], []],
             'posslble': [[], [], [('possible', 'i',