In [204]:
#Import packages
import collections
import itertools
import os
import shutil
import glob
import csv
import pandas as pd

## 1. Create dictionaries by word length

In [205]:
#Clean the data(Delete punctuations and irrelevent characteristics)
def clean(words):
    out = []
    for c in words:
        c = c.lower()
        if c in set('abcdefghijklmnopqrstuvwxyz '):
            out.append(c)
    return ''.join(out)

#Convert character to int
def char_to_index(x):
    return ord(x)-ord('a')

#Combine all train_set files
read_files = glob.glob(os.path.join(os.getcwd(), 'train_set', "*.txt"))
with open("train_combine.txt", "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())

#Clean the train_combine.txt file
with open('train_combine.txt', encoding="utf8") as file:
    with open('train_clean.txt',"w") as file_clean:
        for ws in file:
            words=clean(ws)
            file_clean.write(words)
        file_clean.close()

#Creat empty list and dictionary
group_by_len = collections.defaultdict(list)
digrams_by_len = collections.defaultdict(dict)

#Group all words in tranning set by length
with open('train_clean.txt', encoding="utf8") as file:
    for line in file:
        for w in line.split():
            group_by_len[len(w)].append(w)
            
#Create dictionaries by word length
for length in group_by_len:
        for i, j in itertools.combinations(range(length), 2):
                key = (i, j)
                matrix = [[0] * 26 for _ in range(26)]
                for words in group_by_len[length]:
                    matrix[char_to_index(words[i])][char_to_index(words[j])] = 1
                    digrams_by_len[length][key] = matrix

## 2. Create the error detection list for test set based the dictionaryies created before
#### * In the list, 0- the word is wrong, 1- the word is right

In [215]:
#Combine all test_set files
read_files = glob.glob(os.path.join(os.getcwd(), 'test_set', "*.txt"))
with open("test_combine.txt", "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())
            
##Clean the test_combine.txt file
with open('test_combine.txt', encoding="utf8") as file:
    with open('test_clean.txt',"w") as file_clean:
        for ws in file:
            words=clean(ws)
            file_clean.write(words)
        file_clean.close()
        
#Create the error detection list for test set based the dictionaryies created before
Detection_list=[]
with open('test_clean.txt', encoding="utf8") as file:
    for line in file:
        for words in line.split():
            list1=[]
            if len(words)==1:
                if words == "a" or words=="i":
                    Detection_list.append(1)
                else:
                    Detection_list.append(0)
            else:
                for i, j in itertools.combinations(range(len(words)),2):
                    key = (i, j) 
                    matrix=digrams_by_len[len(words)][key]
                    if matrix[char_to_index(words[i])][char_to_index(words[j])] == 1:
                        list1.append(1)
                    else:
                        list1.append(0)
                if any(item == 0 for item in list1):
                    Detection_list.append(0)
                else:
                    Detection_list.append(1)
print("First 100 numbers in detection list:",Detection_list[0:100])

First 100 numbers of detection list: [0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1]


## 3. Construct the Confusion matrix for the detection method

In [223]:
#Combine all test_set_truth files
read_files = glob.glob(os.path.join(os.getcwd(), 'test_set_truth', "*.txt"))
with open("test_clean_truth.txt", "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())
            
#Convert test_clean.txt and test_clean_truth.txt to list files
wordlist=[]
with open('test_clean.txt', encoding="utf8") as file:
    for line in file:
        for words in line.split():
            wordlist.append(words)   
file_truth=[]          
with open('test_clean_truth.txt', encoding="utf8") as file1: 
    for line in file1:
        for words in line.split():
            file_truth.append(words)
            
#loop through numbers in detection_list and corresponding word in test_file to find false_detection or failed_detection rate       
detection1=[]
detection0=[]
for i in range(len(Detection_list)):
    if Detection_list[i]==1:
        if wordlist[i] in file_truth:
            detection1.append(1)
        else:
            detection1.append(0)
    else:
        if wordlist[i] in file_truth:
            detection0.append(0)
        else:
            detection0.append(1)
                
error_rate0=(1-sum(detection0)/len(detection0))*100 #according to the method, the word is wrong, but actually it's right
error_rate1=(1-sum(detection1)/len(detection1))*100 #according to the method, the word is right, but actually it's wrong
print("Error_rate0:",error_rate0,"% (note:according to the method, the word is wrong, but actually it's right)")
print("Error_rate1:",error_rate1,"% (note:according to the method, the word is right, but actually it's wrong)")

10123
Error_rate0: 11.777150916784207 % (note:according to the method, the word is wrong, but actually it's right)
Error_rate1: 27.363798545354744 % (note:according to the method, the word is right, but actually it's wrong)


## 4. Create one to one matching dataframe for Error word and Ground truth(Code in this chunk is made for the correction part)

In [225]:
list_a=[]
list_b=[] 
combine=pd.DataFrame()
for path1,path2 in zip(glob.glob(os.path.join(os.getcwd(),'test_set','*.txt')),glob.glob(os.path.join(os.getcwd(),'test_truth','*.txt'))):
    with open(path1, encoding="utf8") as file1:
        with open(path2, encoding="utf8") as file2:
            for line1,line2 in zip(file1,file2):
                line3=clean(line1)
                line4=clean(line2)
                if len(line3)==len(line4):
                    for word1,word2 in zip(line3.split(),line4.split()): 
                        if len(word1)!=1:
                            for i, j in itertools.combinations(range(len(word1)),2):
                                key = (i, j)
                                list1=[]
                                matrix=digrams_by_len[len(word1)][key]
                                if matrix[char_to_index(word1[i])][char_to_index(word1[j])] == 1:
                                    list1.append(1)
                                else:
                                    list1.append(0)
                                if any(item == 0 for item in list1):
                                    word_out= word1
                                else:
                                    word_out="Goooood"
                        list_a.append(word_out)
                        list_b.append(word2)
    matching = pd.DataFrame({"Error":list_a,"Truth":list_b})
    matching = matching.drop(matching[matching.Error=="Goooood"].index)
    matching = matching[matching.Error != matching.Truth]
combine = pd.concat([combine,matching])
#write code to excel
writer = pd.ExcelWriter('Error_truth_total.xlsx')
combine.to_excel(writer,'Sheet1')
writer.save()