In [180]:
#import library
import os
import glob
import itertools
import collections
import pandas as pd
import random

# 0. Tool functions

In [181]:
# Reduce a piece of text to lowercase letters and spaces
def clean(words):
    """Return ``words`` lowercased, keeping only the letters a-z and spaces.

    Every character is lowercased individually and kept only when the lowered
    result is one of the 26 ASCII letters or a space.  Digits, punctuation,
    tabs, newlines and all other characters are dropped.
    """
    allowed = set('abcdefghijklmnopqrstuvwxyz ')
    lowered_chars = (ch.lower() for ch in words)
    return ''.join(ch for ch in lowered_chars if ch in allowed)

# Transform a letter into its 0-based position in the alphabet
def char_to_index(char):
    """Return the offset of ``char`` from 'a' ('a' -> 0, ..., 'z' -> 25).

    No range check is performed: the result is simply the code point of
    ``char`` minus the code point of 'a'.
    """
    base = ord('a')
    return ord(char) - base

# 1. Combine ground truth and OCR files

In [182]:
#set file path
input_path = '../data/'
output_path = '../output/Detection/'
# makedirs (unlike mkdir) also creates a missing '../output/' parent, and
# exist_ok=True makes re-running this cell harmless.
os.makedirs(output_path, exist_ok=True)


def _concat_files(pattern, destination):
    """Concatenate every file matching ``pattern`` (sorted by path) into ``destination``.

    Files are copied as raw bytes.  When a non-empty file does not end with a
    newline, one is appended so that its last line is not glued to the first
    line of the next file (which would merge two words and shift the
    line-by-line alignment between the ground-truth and OCR outputs).
    """
    with open(destination, 'wb') as combined:
        for path in sorted(glob.glob(pattern)):
            with open(path, 'rb') as source:
                data = source.read()
            combined.write(data)
            if data and not data.endswith(b'\n'):
                combined.write(b'\n')


#create files of ground truth
_concat_files(os.path.join(input_path, 'ground_truth/' + '*.txt'),
              os.path.join(output_path, 'truth_combined.txt'))

#create files of orc
_concat_files(os.path.join(input_path, 'tesseract/' + '*.txt'),
              os.path.join(output_path, 'orc_combined.txt'))

# 2. Convert ground truth files to a dictionary

In [183]:
# Write the cleaned ground-truth text used to build the dictionary:
# only letters and spaces are kept.
truth_cleaned_path = os.path.join(output_path, 'truth_cleaned.txt')
truth_combined_path = os.path.join(output_path, 'truth_combined.txt')
with open(truth_cleaned_path, 'w') as output3, open(truth_combined_path, 'r') as input3:
    # clean() removes the newline as well, so a space is appended after each
    # line to keep its last word apart from the first word of the next line.
    output3.writelines(clean(raw_line) + ' ' for raw_line in input3)

In [184]:
# Group the distinct ground-truth words of two or more letters by their length
words_by_len = {}
with open(os.path.join(output_path, 'truth_cleaned.txt'), 'r') as input0:
    for text_line in input0:
        for token in text_line.split():
            if len(token) > 1:
                words_by_len.setdefault(len(token), set()).add(token)

# For every word length and every pair of positions (first < second), build a
# 26x26 table whose cell [a][b] is 1 when some ground-truth word of that length
# has letter a at position `first` and letter b at position `second`.
dic_by_len = collections.defaultdict(dict)
for length in sorted(words_by_len):
    same_length_words = words_by_len[length]
    for position_pair in itertools.combinations(range(length), 2):
        first, second = position_pair
        seen = [[0] * 26 for _ in range(26)]
        for token in same_length_words:
            seen[char_to_index(token[first])][char_to_index(token[second])] = 1
        # Every length bucket holds at least one word, so storing the table
        # once after the scan gives the same mapping as storing it per word.
        dic_by_len[length][position_pair] = seen

# 3. Detect errors based on the dictionary we created

**Rules**: (0-error;1-good)  
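- If the word contains any non-alphabetical character, set it to 0
- If the word contains only one character, set it to 1 if it is 'a' or 'i', and to 0 otherwise
- Otherwise, set it to 1 only if every pair of letter positions in the word holds a letter pair that appears at the same positions in some ground-truth word of the same length; set it to 0 if any pair was never seen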

In [185]:
def detect(filename, dictionary=None):
    """Flag every whitespace-separated token of a text file as good (1) or error (0).

    Each token is lowercased and then classified:

    * it contains a character outside a-z             -> 0
    * it is a single letter                           -> 1 for 'a' or 'i', else 0
    * otherwise                                       -> 1 only when, for every
      pair of positions (i, j) with i < j, the cell
      ``[char_to_index(token[i])][char_to_index(token[j])]`` of the table stored
      under ``dictionary[len(token)][(i, j)]`` equals 1; 0 as soon as one pair
      is not marked.  A token whose length (or position pair) has no table is
      flagged 0 instead of raising ``KeyError``.

    Parameters
    ----------
    filename : str
        Path of the text file to scan.
    dictionary : mapping, optional
        ``{word_length: {(i, j): 26x26 matrix of 0/1}}``.  When omitted, the
        module-level ``dic_by_len`` is used.

    Returns
    -------
    list of int
        One 0/1 flag per token, in reading order.
    """
    if dictionary is None:
        dictionary = dic_by_len
    letters = set('abcdefghijklmnopqrstuvwxyz')

    def pairs_all_seen(word):
        # .get() instead of indexing: indexing a defaultdict with an unseen
        # length would insert an empty entry and then fail on the pair lookup.
        tables = dictionary.get(len(word))
        if not tables:
            return False
        for i, j in itertools.combinations(range(len(word)), 2):
            table = tables.get((i, j))
            if table is None or table[char_to_index(word[i])][char_to_index(word[j])] != 1:
                # One unseen letter pair is enough to reject the word.
                return False
        return True

    Detection_list = []
    with open(filename, 'r') as file:
        for line in file:
            for word in line.split():
                word = word.lower()
                if any(c not in letters for c in word):
                    # contain non-alphabetical characters
                    Detection_list.append(0)
                elif len(word) == 1:
                    # only 'a' and 'i' are accepted as one-letter words
                    Detection_list.append(1 if word in ('a', 'i') else 0)
                else:
                    # according to the dictionary
                    Detection_list.append(1 if pairs_all_seen(word) else 0)
    return Detection_list

In [186]:
# Run the detector over the combined OCR text; `err` receives one flag per
# token (0 = error, 1 = good).
ocr_combined_file = '../output/Detection/orc_combined.txt'
err = detect(ocr_combined_file)

# 5. Calculate the error rate

In [187]:
# Share of OCR tokens flagged as errors: `err` holds 0/1 flags (1 = good), so
# this is one minus the mean of the flags.
# Raises ZeroDivisionError when `err` is empty.
1-sum(err)/len(err)

0.34465132160359924

# 6. Prepare for correction

In [188]:
# create truth file and orc file for correction
def clean2(line):
    """Normalise one line of text into space-joined, letters-only words.

    The line is lowercased and split on whitespace.  A word containing any
    character other than a-z or 0-9 is dropped entirely.  In the remaining
    words each digit is replaced by a letter drawn with
    ``random.randint(0, 25)``, so the word keeps its length.  The surviving
    words are returned joined by single spaces ('' when none survive).
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    digits = '0123456789'
    kept = []
    for token in line.lower().strip().split():
        if any(ch not in letters and ch not in digits for ch in token):
            # punctuation or any other symbol disqualifies the whole word
            continue
        rebuilt = ''.join(
            ch if ch in letters else letters[random.randint(0, 25)]
            for ch in token
        )
        kept.append(rebuilt)
    return ' '.join(kept)

# makedirs also creates a missing '../output/' parent and does not fail when
# the directory already exists.
os.makedirs('../output/Correction/', exist_ok=True)

# Walk the combined ground-truth and OCR files line by line, normalise both
# lines with clean2(), and keep a pair only when both sides end up with the
# same number of words, so the two outputs stay aligned word for word.
# All four files are opened in one `with`, so they are closed even when the
# loop is interrupted by an exception.
# NOTE(review): clean2() draws random letters for digits independently on each
# side, so a number present in both files becomes two different strings --
# confirm this is intended.
with open(os.path.join(output_path, 'truth_combined.txt'), 'r') as truth_file, \
     open(os.path.join(output_path, 'orc_combined.txt'), 'r') as orc_file, \
     open('../output/Correction/truth_corrected.txt', 'w') as out_truth_file, \
     open('../output/Correction/orc_corrected.txt', 'w') as out_orc_file:
    # zip() stops at the end of the shorter file.
    for line1, line2 in zip(truth_file, orc_file):
        line1 = clean2(line1)
        line2 = clean2(line2)
        if len(line1.split()) == len(line2.split()):
            out_truth_file.write(line1 + ' ')
            out_orc_file.write(line2 + ' ')

In [189]:
# Run the detector on the aligned OCR words; `err_num` holds one 0/1 flag per
# word and is attached to the matching tables built further down.
ocr_corrected_file = '../output/Correction/orc_corrected.txt'
err_num = detect(ocr_corrected_file)

In [190]:
# Flatten the two aligned files into word lists:
# list1 = ground-truth words, list2 = OCR words.
with open('../output/Correction/truth_corrected.txt') as file1:
    list1 = [token for text_line in file1 for token in text_line.strip().split()]

with open('../output/Correction/orc_corrected.txt') as file2:
    list2 = [token for text_line in file2 for token in text_line.strip().split()]

def prev(num, words=None):
    """Return, for each position, the word ``num`` places earlier.

    Parameters
    ----------
    num : int
        How many positions to look back.
    words : list of str, optional
        Word sequence to shift.  Defaults to the module-level ``list2`` so
        existing ``prev(n)`` calls keep working.

    Returns
    -------
    list of str
        Same length as ``words``; entry ``i`` is ``words[i - num]``, or ``'*'``
        when ``i - num`` falls before the start of the sequence.
    """
    if words is None:
        words = list2
    return [words[i - num] if i - num >= 0 else '*' for i in range(len(words))]
def follow(num, words=None):
    """Return, for each position, the word ``num`` places later.

    Parameters
    ----------
    num : int
        How many positions to look ahead.
    words : list of str, optional
        Word sequence to shift.  Defaults to the module-level ``list2`` so
        existing ``follow(n)`` calls keep working.

    Returns
    -------
    list of str
        Same length as ``words``; entry ``i`` is ``words[i + num]``, or ``'*'``
        when ``i + num`` runs past the end of the sequence.
    """
    if words is None:
        words = list2
    total = len(words)
    return [words[i + num] if i + num < total else '*' for i in range(total)]

In [191]:
# Context columns: the OCR words 1 to 4 positions before and after each word
prev1, prev2, prev3, prev4 = (prev(offset) for offset in (1, 2, 3, 4))
next1, next2, next3, next4 = (follow(offset) for offset in (1, 2, 3, 4))

# 'SAME' is the detect() flag of the OCR word cast to bool (True = not flagged
# as an error); it is not a comparison between the OCR and ground-truth words.
detected_ok = [bool(flag) for flag in err_num]

# Table with a context window of two words on each side
matching2_columns = {
    'prev2': prev2,
    'prev1': prev1,
    'WORD_ORC': list2,
    'next1': next1,
    'next2': next2,
    'WORD_TRUE': list1,
    'SAME': detected_ok,
}
matching2 = pd.DataFrame(matching2_columns)

# Table with a context window of four words on each side
# NOTE(review): 'WORD_TRUR' differs from the 'WORD_TRUE' header used in
# matching2 and looks like a typo; it is kept unchanged because the CSV header
# may be read by other code.
matching4_columns = {
    'prev4': prev4,
    'prev3': prev3,
    'prev2': prev2,
    'prev1': prev1,
    'WORD_ORC': list2,
    'next1': next1,
    'next2': next2,
    'next3': next3,
    'next4': next4,
    'WORD_TRUR': list1,
    'SAME': detected_ok,
}
matching4 = pd.DataFrame(matching4_columns)

for frame, csv_target in ((matching2, '../output/Correction/matching2.csv'),
                          (matching4, '../output/Correction/matching4.csv')):
    frame.to_csv(csv_target)