# 1. Detection

## 1.0 Importing libraries for detection

In [None]:
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import collections
from collections import defaultdict
from collections import Counter
from itertools import groupby
import nltk

## 1.1 Labelling tesseact

In [None]:

'''

Applied Data Science 
Project 4 - Detection

'''


#import feature as feature


def labelTesseract():
        

    
    truth_files_list = glob.glob('../data/ground_truth/*.txt') 
    test_files_list = glob.glob('../data/tesseract/*.txt')

    # only taking the ones that have the same number of lines in the file

    truth_files = []
    test_files = []
    file_counts = 0 # store number of files (test and truth have same length)

    for truth, test in zip(truth_files_list, test_files_list):
        truth_length = len(open(truth).readlines())    
        test_length = len(open(test).readlines())
        if truth_length == test_length:
            file_counts += 1
            truth_files.append(truth)
            test_files.append(test)


    # only taking lines that have the same number of words
    truth_words = []
    test_words = []
    truth_test_pair = [] # for correction
    actual_counts = 0 # actual counts of numbers of words after filtering
    for truth, test in zip(truth_files, test_files):
            
        with open(truth) as fd_truth:
            with open(test) as fd_test:
                for truth_line, test_line in zip(fd_truth, fd_test):
                    tmp_truth = truth_line.strip().split()
                    tmp_test = test_line.strip().split()
                    if len(tmp_truth) == len(tmp_test):
                        for truth_word, test_word in zip(tmp_truth, tmp_test):
                            actual_counts += 1
                            truth_words.append(truth_word)
                            test_words.append(test_word)
                            truth_test_pair.append((truth_word, test_word))
    # uncomment below for testing
    
    # print(actual_counts)
    print(len(truth_words))
    print(len(test_words))
    print(truth_words[:20])
    print(test_words[:20])
    
    '''
    # from the lists of words (truth, test) compare each of them
    # label 1 if test is the same as truth (correct)
    # label 0 if test is the different (wrong)

    label_dict = defaultdict(int)

    for truth, test in zip(truth_words, test_words):
        if truth == test:
            label_dict[test] = 1
        else:
            label_dict[test] = 0

    print(label_dict)
    '''

    # due to not being able to store duplicates, switching to list

    label = []
    for truth, test in zip(truth_words, test_words):
        if truth == test:
            label.append(1)
        else:
            label.append(0)
    
    # uncomment below for commenting
    
    print(label[:20])
    

    return (truth_test_pair, test_words, label)

## 1.2 Divide into test/train (by default 20%, 80%)

In [None]:
def div_train(pair, label, k = 0.2):

    # data = pd.DataFrame(words)
    # split up data into k / 1-k percentage -- by defauly 80% train 20% test
    train_data, test_data, train_label, test_label = train_test_split(pair, label, test_size = k)

    X_train = []
    X_test = []
    X_train_truth = []
    X_test_truth = []
    for data in train_data:
        X_train.append(data[1])
        X_train_truth.append(data[0])
    for data in test_data:
        X_test.append(data[1])
        X_test_truth.append(data[0])



    return (X_train, X_test, train_label, test_label, X_train_truth, X_test_truth)


## 1.3 Building Features

In [7]:
def buildFeatures(train_data, bigram_dict):
    # f1
    length = []
    
    # f2
    v_count = []
    c_count = []
    v_div_l = []
    c_div_l = []
    v_div_c = []
    
    # f3
    non_alnum = []
    non_alnum_div_l = []
    
    # f4
    digit = []
    digit_l = []

    # f5
    lower = []
    upper = []
    lower_div_l = []
    upper_div_l = []

    #f6
    three_consec_cons = []

    #f7
    alpha_num = []

    #f8
    six_consec_cons = []

    #f9
    infix = []

    #f10
    bigram = []

    #f11
    most_freq = []

    #f12
    non_div_alpha = []

    for word in train_data:
        length.append(f_1(word))
        
        v_count.append(f_2(word)[0])
        c_count.append(f_2(word)[1])
        v_div_l.append(f_2(word)[2])
        c_div_l.append(f_2(word)[3])
        v_div_c.append(f_2(word)[4])
        
        non_alnum.append(f_3(word)[0])
        non_alnum_div_l.append(f_3(word)[1])

        digit.append(f_4(word)[0])
        digit_l.append(f_4(word)[1])

        lower.append(f_5(word)[0])
        upper.append(f_5(word)[1])
        lower_div_l.append(f_5(word)[2])
        upper_div_l.append(f_5(word)[3])

        three_consec_cons.append(f_6(word))

        alpha_num.append(f_7(word))

        six_consec_cons.append(f_8(word))

        infix.append(f_9(word))

        # can change the scaling constant (third parameter)
        bigram.append(f_10(word, bigram_dict, 10000))

        most_freq.append(f_11(word))

        non_div_alpha.append(f_12(word))




    # create DataFrame

    df = pd.DataFrame({'length': length,
                       'num_vowels': v_count,
                       'num_conso': c_count,
                       'v_div_l': v_div_l,
                       'c_div_l': c_div_l,
                       'v_div_c': v_div_c,
                       'non_alnum': non_alnum,
                       'non_alnum_div_l': non_alnum_div_l,
                       'digit': digit,
                       'digit_l': digit_l,
                       'lower': lower,
                       'upper': upper,
                       'lower_div_l': lower_div_l,
                       'upper_div_l': upper_div_l,
                       'three_consec_cons': three_consec_cons,
                       'alpha_num': alpha_num,
                       'six_consec_cons': six_consec_cons,
                       'infix': infix,
                       'bigram': bigram,
                       'most_freq': most_freq,
                       'non_div_alpha': non_div_alpha})


    return df

def f_1(word):
    
    return len(word)

def f_2(word):
    l = len(word)
    vowels = 'aeiou'
    cons = 'bcdfghjklmnpqrstvwxyz'
    v_count = 0
    c_count = 0
    
    for c in word:
        if c in vowels:
            v_count += 1
        elif c in cons:
            c_count += 1


    if c_count == 0:
        return (v_count, c_count, v_count/l, c_count/l, 0)

    return (v_count, c_count, v_count/l, c_count/l, v_count/c_count)

def f_3(word):
    l = len(word)
    non_alnum = 0

    for c in word:
        if not c.isalnum():
            non_alnum += 1

    return (non_alnum, non_alnum/l)

def f_4(word):
    l = len(word)
    digit = 0

    for c in word:
        if c.isdigit():
            digit += 1
    return (digit, digit/l)

def f_5(word):
    l = len(word)
    upper = 0 
    lower = 0

    for c in word:
        if c.isupper():
            upper += 1
        elif c.islower():
            lower += 1 

    return (lower, upper, lower/l, upper/l)

def f_6(word):
    l = len(word)
    groups = groupby(word)
    result = [(label, sum(1 for _ in group)) for label, group in groups]

    max_count = float('-inf')
    for word_count in result:
        if word_count[1] > max_count:
            max_count = word_count[1]

    if max_count >= 3:
        return max_count/l
    else:
        return 0

def f_7(word):
    l = len(word)
    alnum = 0

    for c in word:
        if c.isalnum():
            alnum += 1
    
    non_alnum = l - alnum

    if non_alnum > alnum:
        return 1
    else:
        return 0

def f_8(word):
    cons = 'bcdfghjklmnpqrstvwxyz'
    consec_cons = 0
    max_count = 0

    for c in word:
        if c in cons:
            consec_cons += 1
        else:
            if max_count < consec_cons:
                max_count = consec_cons
            consec_cons = 0
    if max_count == 0:
        max_count = consec_cons

    if max_count >= 6:
        return 1
    else:
        return 0

def f_9(word):
    infix = word[1:-1]
    non_alnum = 0
    
    for c in infix:
        if not c.isalnum():
            non_alnum += 1
    if non_alnum >= 2:
        return 1
    else:
        return 0

def f_10(word, bigram_dict, c = 10000):

    word = word.lower()
    count = 0
    naturalness = 0
    for i in range(len(word)-1):
        count += 1.0
        naturalness += bigram_dict[(word[i], word[i+1])] / c

    if count == 0.0:
        return 0
    return naturalness / count

# return frequency of most frequent symbol
def f_11(word):
    l = len(word)
    most_freq = collections.Counter(word).most_common(1)[0][1]

    if most_freq >= 3:
        return most_freq/l
    else:
        return 0

def f_12(word):
    l = len(word)
    alpha = 0

    for c in word:
        if c.isalpha():
            alpha += 1

    non_alpha = l - alpha
    if alpha == 0:
        return 0
    
    return non_alpha / alpha

def compute_bigram():
    
    bigram_dict = defaultdict(int)
    truth_files_list = glob.glob('../data/ground_truth/*.txt')
    for file in truth_files_list:
        with open(file) as fd:
            for line in fd:
                each_line = line.strip().split()
                for word in each_line:
                    word = word.lower()
                    for i in range(len(word)-1):
                        bigram_dict[(word[i], word[i+1])] += 1

    return bigram_dict

## 1.4 Main SVM calling part

In [None]:
pair, words, label = labelTesseract()
train_data, test_data, train_label, test_label, ground_truth_train, ground_truth_test = div_train(pair, label)

# uncomment to test for truth, tesseract pair

print(train_data[:10])
print(ground_truth_train[:10])
print(train_label[:10])

print(test_data[:10])
print(ground_truth_test[:10])
print(test_label[:10])



bigram_dict = compute_bigram()
featureMatrix_train = buildFeatures(train_data, bigram_dict)
featureMatrix_test = buildFeatures(test_data, bigram_dict)

# uncomment for testing
'''
head = featureMatrix_train.head()
print(head.to_string())
'''

# build classifier
svm_class = SVC(kernel='rbf', verbose=True, gamma='scale')
svm_class.fit(featureMatrix_train, train_label)

# prediction
prediction = svm_class.predict(featureMatrix_test)

output = pd.DataFrame({'data': test_data,
                       'label': prediction})

print(output[:20])

##### evaluation
#confustion Matrix
from sklearn.metrics import classification_report, confusion_matrix

# print(confusion_matrix(test_label, prediction))
print(classification_report(test_label, prediction))

221504
221504
['communications', 'network.', 'Member', 'companies', 'are', 'strongly', 'encouraged', 'to', 'provide', 'this', 'needed', 'support.', 'The', 'state', 'advocacy', 'program*', 'including', 'the', 'new', 'CMA/LINC']
['communlcatlons', 'network.', 'Member', 'companles', 'are', 'strongly', 'encouraged', 'to', 'provlde', 'thls', 'needed', 'support.', 'The', 'state', 'advocacy', 'program"', '1nclud1ng', 'the', 'new', 'CMA/LINC']
[0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1]
['1', 'they', '15', 'to', 'that', 'Natlonal', 'Whltson', 'the', 'energyiconsumlng', 'from']
['12', 'they', 'is', 'to', 'that', 'National', 'Whitson', 'the', 'energy-consuming', 'from']
[0, 1, 0, 1, 1, 0, 0, 1, 0, 1]
['than', 'dutles', 'accord', "Victim's", 'of', '1n', 'the', 'for', 'by', '7']
['than', 'duties', 'accord', "Victim's", 'of', 'in', 'the', 'for', 'by', '-']
[1, 0, 1, 1, 1, 0, 1, 1, 1, 0]
[LibSVM]

### 1.1 Output OCR to detected_typo.csv

In [2]:
# output.to_csv('../output/detected_typo.csv')

# 2. Correction

### 2.1 Import detected typo

### 2.2 Clean detected typo (remove punctuation & number)

In [136]:
detected_typo_and_correct = pd.read_csv('../output/detected_typo.csv',index_col = 0)
# remove label column
detected_typo = detected_typo_and_correct[detected_typo_and_correct.label == 0].data
detected_typo_and_correct = detected_typo_and_correct.data
detected_typo.head(5)

0      $50,000.
1     1nclud1ng
3           29,
7          thls
10    polltlcal
Name: data, dtype: object

In [5]:
def remove_punct_num(series):
    result = series.replace(r'\d','')
    result = result.str.extract(r'([a-zA-Z]+)').dropna()[0]
    result = result.str.lower()
    
    return result

In [6]:
cleaned_typo = remove_punct_num(detected_typo)
cleaned_typo_and_correct = remove_punct_num(detected_typo_and_correct)
# detected_typo_and_correct

In [7]:
pair, words, label = labelTesseract()

true_typo = pd.DataFrame(pair)
true_typo.columns = ['correct','typo']
for col in true_typo.columns:
    true_typo[col] = remove_punct_num(true_typo[col])
true_typo = true_typo[true_typo['correct'] != true_typo['typo']].dropna().reset_index(drop = True)
true_typo.drop_duplicates(keep = 'first',inplace = True)
true_typo = true_typo[['typo','correct']].reset_index(drop = True)
# true_typo.set_index('typo',inplace = True)


### 2.3 Define N & V

In [8]:
import glob
import re
truth_counts = 0
training = []
# create a list of all .txt files
truth_files_list = glob.glob('../data/ground_truth/*.txt')
# reading the ground truth file
for file in truth_files_list:
    with open(file) as fd:
        for line in fd:
            each_line = re.findall(r"[\w']+",line)
            for word in each_line:
                training.append(word)
                truth_counts += 1
                
training = pd.Series(training)
training = training.str.replace(r'\d','').dropna()

training = training.str.lower()
training = training[training != '']
corpus = training.unique()

N = len(training)
V = len(corpus)

### 2.4 Find Candidates

We set edit distance to be 1 and find candidates from all ground truth articles. There are 4 different situations:

- Insertion
- Deletion
- Reversal
- Substitution

Because of the assumption in paper (there are only one typo in each word), we only find candidates that has one edit distance with the typo (there is a special case for reversal, since for the reversal case, edit distance is 2).

In [82]:
import numpy as np
import pandas as pd
from collections import Counter 
from nltk import edit_distance

def typo_classification(typo,correct):
    if (len(typo) > len(correct)):
        return 'insertion'
    elif (len(typo) < len(correct)):
        return 'deletion'
    else:
        typo_count = Counter(typo)
        correct_count = Counter(correct)
        if typo_count == correct_count:
            return 'reversal'
        else:
            return 'substitution'

def find_candidates(typo,corpus):
    candidates = []
    candi_type = []
    for word in corpus:
        ed = edit_distance(typo,word)
        word_type = typo_classification(typo,word)
#         if len(typo) > 4:
#             if ed in [1,2]:
#                 candidates.append(word)
#                 candi_type.append(word_type)
#         else:
        if ((ed == 1) |((ed == 2) & (word_type == 'reversal'))):
            candidates.append(word)
            candi_type.append(word_type)
    return candidates,candi_type

def find_position(typo,candidates):
    position = []
    for corr in candidates:
        typo_type = typo_classification(typo,corr)
        
        if (typo_type == 'deletion'):
            typo += '#'

            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    if corr[i] != corr[i-1]:
                        typo = typo[:-1]
                        position.append([typo,corr,"#",corr[i],i,typo_type])
                        break
                    else:
                        typo = typo[:-1]
                        position.append([typo,corr,"#",corr[i],i,typo_type])
                        position.append([typo,corr,"#",corr[i],i-1,typo_type])
                        break
                        
                i += 1
        elif (typo_type == 'insertion'):
            corr += '#'

            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    
                    if typo[i] != typo[i-1]:
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"#",i,typo_type])
                        break
                    elif ((typo[i] == typo[i-1])& (typo[i] == typo[i-2])):
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"#",i,typo_type])
                        position.append([typo,corr,typo[i],"#",i-1,typo_type])
                        position.append([typo,corr,typo[i],"#",i-2,typo_type])
                        break
                    else:
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"#",i,typo_type])
                        position.append([typo,corr,typo[i],"#",i-1,typo_type])
                        break
                i += 1
        elif (typo_type == 'substitution'):
            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    position.append([typo,corr,typo[i],corr[i],i,typo_type])
                    break
                i+=1
                
        elif (typo_type == 'reversal'):
            i = 0
            while i < len(corr)-1:
                if ((typo[i] == corr[i+1]) & (typo[i+1] == corr[i])):
                    typo_comb = typo[i] + typo[i+1]
                    position.append([typo,corr,typo_comb,typo_comb[::-1],i,typo_type])
                    break
                i +=1
    return position

### 2.5 Import 4 confusion matrics

In [10]:
confusionsub=pd.read_csv('../data/confusion_matrix/sub_matrix.csv',index_col = 0)
confusionadd=pd.read_csv('../data/confusion_matrix/add_matrix.csv',index_col = 0)
confusiondel=pd.read_csv('../data/confusion_matrix/del_matrix.csv',index_col = 0)
confusionrev=pd.read_csv('../data/confusion_matrix/rev_matrix.csv',index_col = 0) 
# corpus = set(truth_clean)

### 2.6 Count bigram & 1gram & freq

Calculating word(character) frequency for ground truth paper for further use.

In [11]:
from collections import Counter
from nltk import ngrams

def bigram(string):
    x = []
    for i in range(len(string)):
        if i == len(string) - 1:
            return x
        else:
            x.append(string[i] + string[i+1])
            
def one_gram(string):
    return list(string)

def total_freq(training,types):
    if types == 'bigram':
        result = []
        for string in training:
            result += bigram(string)
        return Counter(result)
    elif types == 'onegram':
        result = []
        for string in training:
            result += one_gram(string)
        return Counter(result)
    elif types == 'freq':
        return Counter(training)
    
total_freq_bigram = total = total_freq(training,types = 'bigram')
total_freq_1gram = total = total_freq(training,types = 'onegram')
total_freq = total = total_freq(training,types = 'freq')

### 2.7 Calculate Likelihood

In [12]:
correction = pd.DataFrame()

def probabilityfunction(correction):
    for i in range(0,correction.shape[0]):
        typo = correction.iloc[i,0]
        index=correction.iloc[i,4]
        specificword=correction.iloc[i,1]
        if correction.iloc[i,5]=='insertion':
            if index != 0:

                #index=correction.iloc[i,4]
                X=specificword[index-1]
                Y=typo[index]
                add =confusionadd.loc[X,Y]
                total = total_freq_bigram[X+Y]
                    #lis.append(total)
                result =add/total
            if index == 0:
                X='#'
                Y=specificword[index]
                add =confusionadd.loc[X,Y]
                total=len(training)

                result=add/total

        if correction.iloc[i,5]=='deletion':
            if index != 0:

                #index=correction.iloc[i,4]
                X=specificword[index-1]
                Y=specificword[index]
                delt=confusiondel.loc[X,Y]
                
                total = total_freq_bigram[X+Y]
                    #lis.append(total)
                result=delt/total


            if index == 0:
                X='#'
                Y=specificword[index]
                delt=confusiondel.loc[X,Y]
                totall=len(training)

                result=delt/totall
        if correction.iloc[i,5]=='reversal':


                #index=correction.iloc[i,4]
                X=specificword[index]
                Y=specificword[index+1]
                rev=confusionrev.loc[X,Y]
                
                total = total_freq_bigram[X+Y]
                result=rev/total


        if correction.iloc[i,5]=='substitution':
            X=correction.iloc[i,2]
            Y=correction.iloc[i,3]
            sub = confusionsub.loc[X,Y]

            total = total_freq_1gram[Y]
                #lis.append(total)
            result=sub/total
            
        correction.loc[i,'probability of t given c'] = result


### 2.8 Calculate Posterior

In [13]:
def Correction(typos):
    from tqdm import tqdm_notebook

    output = []
    no_correction = 0
    no_correct_word = []

    for typo in tqdm_notebook(typos):
        try:
            candidates,cand_type = find_candidates(typo,corpus)
            correction = find_position(typo,candidates)
            correction = pd.DataFrame(correction)

            if correction.empty:  
                output.append(typo)
                no_correct_word.append(typo)
                no_correction += 1

            else:
                correction.columns = ['Typo','Correction','old','new','index','type']
                correction = correction[correction['index'] >= 0]

                if len(correction) == 1:
                    output.append(correction.loc[0,'Correction'])
                else:
                    # 1. calculate the prior

                    freq = [] # the number of times that the proposed correction c appears in the training set
                    for cor in correction['Correction']:
                        freq.append(total_freq[cor])    

                    N = len(training)
                    V = len(corpus)

                    prior = (pd.DataFrame(freq) + 0.5)/(N + V/2)

                    correction['probability of c'] = prior

                    probabilityfunction(correction)

                    # 3. Calculate the posterior and find the correction that has maximum posterior

                    correction['posterior'] = correction['probability of c'] * correction['probability of t given c']
                    best = correction[correction.posterior == correction.posterior.max()].Correction.values[0]
                    output.append(best)
        except:
            output.append(typo)
            no_correct_word.append(typo)
            no_correction += 1
    
    return (pd.Series(output)),no_correction,no_correct_word


### 2.9 Evaluation

In [138]:
precision_denominator = len(cleaned_typo)
recall_denominator = len(cleaned_typo_and_correct)

In [110]:
from nltk import ngrams

def vintersection(list1,list2,ngram = False):
    list1_dict = {}
    list2_dict = {}
    
    if ngram:
        list1 = list(''.join(list1))
        list2 = list(''.join(list2))

    for i in list1:
        list1_dict[i] = list1_dict.get(i,0) + 1

    for i in list2:
        list2_dict[i] = list2_dict.get(i,0) + 1
        
    result = {}
    for key in list1_dict.keys():
        if key in list2_dict.keys():
            value1 = list1_dict[key]
            value2 = list2_dict[key]
            min_value = min(value1,value2)
            result[key] = min_value
    return sum(result.values())

def precision(GT,OCR,ngram = False):
    TP = vintersection(GT,OCR,ngram)
    if ngram:
        OCR = list(''.join(OCR))
    return TP/precision_denominator

def recall(GT,OCR,ngram = False):
    TP = vintersection(GT,OCR)
    if ngram:
        GT = list(''.join(GT))
    return TP/recall_denominator

# 3.1 Case 1: Correct all typos

### Correction

In [96]:
typos = true_typo['typo']
correct = true_typo['correct']

Correction_output_all,no_correction_num,no_correct_word = Correction(typos)

HBox(children=(IntProgress(value=0, max=12111), HTML(value='')))






In [97]:
print('Accuracy: {:.2%}'.format(vintersection(Correction_output_all,correct)/len(Correction_output_all)))

Accuracy: 48.20%


In [98]:
print('No corrected rate: {:.1%}'.format(no_correction_num/len(Correction_output_all)))

No corrected rate: 39.3%


### Output correction file

In [99]:
Correction_output_all.to_csv('../output/Correction_output_all.csv')

### Calculate Recall & precision 

In [74]:
recall(Correction_output,correct[:100])

0.63

In [131]:
pair

[('communications', 'communlcatlons'),
 ('network.', 'network.'),
 ('Member', 'Member'),
 ('companies', 'companles'),
 ('are', 'are'),
 ('strongly', 'strongly'),
 ('encouraged', 'encouraged'),
 ('to', 'to'),
 ('provide', 'provlde'),
 ('this', 'thls'),
 ('needed', 'needed'),
 ('support.', 'support.'),
 ('The', 'The'),
 ('state', 'state'),
 ('advocacy', 'advocacy'),
 ('program*', 'program"'),
 ('including', '1nclud1ng'),
 ('the', 'the'),
 ('new', 'new'),
 ('CMA/LINC', 'CMA/LINC'),
 ('computer', 'computer'),
 ('network,', 'network.'),
 ('will', 'will'),
 ('be', 'be'),
 ('heavily', 'heavlly'),
 ('involved', '1nvolved'),
 ('in', '1n'),
 ('1986', '1995'),
 ('in', '1n'),
 ('the', 'the'),
 ('critical', 'crltlcal'),
 ('environmental', 'envlronmental'),
 ('issues', 'lssues'),
 ('identified', '1dent1£1ed'),
 ('by', 'by'),
 ('the', 'the'),
 ('National', 'Natlonal'),
 ('Conference', 'Conference'),
 ('of', 'of'),
 ('State', 'State'),
 ('Legislators,', 'Leglslators,'),
 ('namely,', 'namely,'),
 ('gro

# Case 2: Only consider edit distance = 1 case

In [129]:
# cleaned_typo.map(true_typo_dict)
# cleaned_typo
ed_1_typo = []
from nltk import edit_distance
for i in range(len(true_typo)):
    typo = true_typo.loc[i,'typo']
    correct = true_typo.loc[i,'correct']
    if edit_distance(typo,correct) == 1:
        ed_1_typo.append([typo,correct])
                         
ed_1_typo_df = pd.DataFrame(ed_1_typo)
ed_1_typo_df.columns = ['typo','correct']

In [130]:
true_typo

Unnamed: 0,typo,correct
0,communlcatlons,communications
1,companles,companies
2,provlde,provide
3,thls,this
4,nclud,including
5,heavlly,heavily
6,nvolved,involved
7,n,in
8,crltlcal,critical
9,envlronmental,environmental


### Correction

In [114]:
typos = ed_1_typo_df['typo']
correct = ed_1_typo_df['correct']

Correction_output,no_correction_num,no_correct_word = Correction(typos)

HBox(children=(IntProgress(value=0, max=5901), HTML(value='')))




In [115]:
print('No corrected rate: {:.1%}'.format(no_correction_num/len(Correction_output)))

No corrected rate: 4.5%


In [116]:
print('Accuracy: {:.2%}'.format(vintersection(Correction_output,correct)/len(Correction_output)))

Accuracy: 86.44%


### Output Correction

In [120]:
Correction_output.to_csv('../output/Correction_output_ed_1.csv')

### Calculate Recall & precision 

In [121]:
precision(Correction_output,correct)

0.3995770014099953

In [123]:
recall(Correction_output,correct)

0.12075087586402802

In [126]:
len(correct)

5901

In [128]:
Correction_output

0                companies
1                  provide
2                     this
3                  heavily
4                 involved
5                        n
6            environmental
7                   issues
8                 national
9              legislators
10                disposal
11                    bill
12                continue
13               suparfund
14                   which
15                detailed
16                     cma
17                  action
18               continues
19                   toxic
20              prevention
21                requires
22               reporting
23              concerning
24                industry
25              operations
26                    risk
27               reduction
28               cessation
29                   issue
               ...        
5871            background
5872                     u
5873                    no
5874                result
5875               patents
5876              together
5