# 1. Detection

In [1]:
import pandas as pd
import numpy as np
from detection import *
from collections import Counter


In [None]:
pair, words, label = labelTesseract()
train_data, test_data, train_label, test_label, ground_truth_train, ground_truth_test = div_train(pair, label)

# uncomment to test for truth, tesseract pair
'''
print(train_data[:10])
print(ground_truth_train[:10])
print(train_label[:10])

print(test_data[:10])
print(ground_truth_test[:10])
print(test_label[:10])
'''


bigram_dict = compute_bigram()
featureMatrix_train = buildFeatures(train_data, bigram_dict)
featureMatrix_test = buildFeatures(test_data, bigram_dict)

# uncomment for testing
'''
head = featureMatrix_train.head()
print(head.to_string())
'''

# build classifier
svm_class = SVC(kernel='rbf', verbose=True, gamma='scale')
svm_class.fit(featureMatrix_train, train_label)

# prediction
prediction = svm_class.predict(featureMatrix_test)

output = pd.DataFrame({'data': test_data,
                       'label': prediction})

print(output[:20])

##### evaluation
#confustion Matrix
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(test_label, prediction))
print(classification_report(test_label, prediction))

##### 1.1 Output OCR to detected_typo.csv

In [2]:
# output.to_csv('../output/detected_typo.csv')

# 2. Correction

##### 2.1 Import detected typo

In [3]:
detected_typo = pd.read_csv('../output/detected_typo.csv',index_col = 0)
detected_typo.head(20)

Unnamed: 0,data,label
0,"$50,000.",0
1,1nclud1ng,0
2,members,1
3,29,0
4,can,1
5,process,1
6,DDT,1
7,thls,0
8,new,1
9,another,1


##### 2.2 Clean detected typo (remove punctuation & number)

In [4]:
detected_typo_and_correct = pd.read_csv('../output/detected_typo.csv',index_col = 0)
# remove label column
detected_typo = detected_typo_and_correct[detected_typo_and_correct.label == 0].data
detected_typo_and_correct = detected_typo_and_correct.data
detected_typo.head(5)

0      $50,000.
1     1nclud1ng
3           29,
7          thls
10    polltlcal
Name: data, dtype: object

In [5]:
def remove_punct_num(series):
    result = series.replace(r'\d','')
    result = result.str.extract(r'([a-zA-Z]+)').dropna()[0]
    result = result.str.lower()
    
    return result

In [6]:
cleaned_typo = remove_punct_num(detected_typo)
cleaned_typo_and_correct = remove_punct_num(detected_typo_and_correct)
# detected_typo_and_correct

In [7]:
pair, words, label = labelTesseract()

true_typo = pd.DataFrame(pair)
true_typo.columns = ['correct','typo']
for col in true_typo.columns:
    true_typo[col] = remove_punct_num(true_typo[col])
true_typo = true_typo[true_typo['correct'] != true_typo['typo']].dropna().reset_index(drop = True)
true_typo.drop_duplicates(keep = 'first',inplace = True)
true_typo = true_typo[['typo','correct']].reset_index(drop = True)
# true_typo.set_index('typo',inplace = True)


### 2.3 Define N & V

In [8]:
import glob
import re
truth_counts = 0
training = []
# create a list of all .txt files
truth_files_list = glob.glob('../data/ground_truth/*.txt')
# reading the ground truth file
for file in truth_files_list:
    with open(file) as fd:
        for line in fd:
            each_line = re.findall(r"[\w']+",line)
            for word in each_line:
                training.append(word)
                truth_counts += 1
                
training = pd.Series(training)
training = training.str.replace(r'\d','').dropna()

training = training.str.lower()
training = training[training != '']
corpus = training.unique()

N = len(training)
V = len(corpus)

### 2.4 Find Candidates

In [82]:
import numpy as np
import pandas as pd
from collections import Counter 
from nltk import edit_distance

def typo_classification(typo,correct):
    if (len(typo) > len(correct)):
        return 'insertion'
    elif (len(typo) < len(correct)):
        return 'deletion'
    else:
        typo_count = Counter(typo)
        correct_count = Counter(correct)
        if typo_count == correct_count:
            return 'reversal'
        else:
            return 'substitution'

def find_candidates(typo,corpus):
    candidates = []
    candi_type = []
    for word in corpus:
        ed = edit_distance(typo,word)
        word_type = typo_classification(typo,word)
#         if len(typo) > 4:
#             if ed in [1,2]:
#                 candidates.append(word)
#                 candi_type.append(word_type)
#         else:
        if ((ed == 1) |((ed == 2) & (word_type == 'reversal'))):
            candidates.append(word)
            candi_type.append(word_type)
    return candidates,candi_type

def find_position(typo,candidates):
    position = []
    for corr in candidates:
        typo_type = typo_classification(typo,corr)
        
        if (typo_type == 'deletion'):
            typo += '#'

            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    if corr[i] != corr[i-1]:
                        typo = typo[:-1]
                        position.append([typo,corr,"#",corr[i],i,typo_type])
                        break
                    else:
                        typo = typo[:-1]
                        position.append([typo,corr,"#",corr[i],i,typo_type])
                        position.append([typo,corr,"#",corr[i],i-1,typo_type])
                        break
                        
                i += 1
        elif (typo_type == 'insertion'):
            corr += '#'

            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    
                    if typo[i] != typo[i-1]:
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"#",i,typo_type])
                        break
                    elif ((typo[i] == typo[i-1])& (typo[i] == typo[i-2])):
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"#",i,typo_type])
                        position.append([typo,corr,typo[i],"#",i-1,typo_type])
                        position.append([typo,corr,typo[i],"#",i-2,typo_type])
                        break
                    else:
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"#",i,typo_type])
                        position.append([typo,corr,typo[i],"#",i-1,typo_type])
                        break
                i += 1
        elif (typo_type == 'substitution'):
            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    position.append([typo,corr,typo[i],corr[i],i,typo_type])
                    break
                i+=1
                
        elif (typo_type == 'reversal'):
            i = 0
            while i < len(corr)-1:
                if ((typo[i] == corr[i+1]) & (typo[i+1] == corr[i])):
                    typo_comb = typo[i] + typo[i+1]
                    position.append([typo,corr,typo_comb,typo_comb[::-1],i,typo_type])
                    break
                i +=1
    return position

### 2.5 Import 4 confusion matrics

In [10]:
confusionsub=pd.read_csv('../data/confusion_matrix/sub_matrix.csv',index_col = 0)
confusionadd=pd.read_csv('../data/confusion_matrix/add_matrix.csv',index_col = 0)
confusiondel=pd.read_csv('../data/confusion_matrix/del_matrix.csv',index_col = 0)
confusionrev=pd.read_csv('../data/confusion_matrix/rev_matrix.csv',index_col = 0) 
# corpus = set(truth_clean)

### 2.6 Count bigram & 1gram & freq

In [11]:
from collections import Counter
from nltk import ngrams

def bigram(string):
    x = []
    for i in range(len(string)):
        if i == len(string) - 1:
            return x
        else:
            x.append(string[i] + string[i+1])
            
def one_gram(string):
    return list(string)

def total_freq(training,types):
    if types == 'bigram':
        result = []
        for string in training:
            result += bigram(string)
        return Counter(result)
    elif types == 'onegram':
        result = []
        for string in training:
            result += one_gram(string)
        return Counter(result)
    elif types == 'freq':
        return Counter(training)
    
total_freq_bigram = total = total_freq(training,types = 'bigram')
total_freq_1gram = total = total_freq(training,types = 'onegram')
total_freq = total = total_freq(training,types = 'freq')

### 2.7 Calculate Likelihood

In [12]:
correction = pd.DataFrame()

def probabilityfunction(correction):
    for i in range(0,correction.shape[0]):
        typo = correction.iloc[i,0]
        index=correction.iloc[i,4]
        specificword=correction.iloc[i,1]
        if correction.iloc[i,5]=='insertion':
            if index != 0:

                #index=correction.iloc[i,4]
                X=specificword[index-1]
                Y=typo[index]
                add =confusionadd.loc[X,Y]
                total = total_freq_bigram[X+Y]
                    #lis.append(total)
                result =add/total
            if index == 0:
                X='#'
                Y=specificword[index]
                add =confusionadd.loc[X,Y]
                total=len(training)

                result=add/total

        if correction.iloc[i,5]=='deletion':
            if index != 0:

                #index=correction.iloc[i,4]
                X=specificword[index-1]
                Y=specificword[index]
                delt=confusiondel.loc[X,Y]
                
                total = total_freq_bigram[X+Y]
                    #lis.append(total)
                result=delt/total


            if index == 0:
                X='#'
                Y=specificword[index]
                delt=confusiondel.loc[X,Y]
                totall=len(training)

                result=delt/totall
        if correction.iloc[i,5]=='reversal':


                #index=correction.iloc[i,4]
                X=specificword[index]
                Y=specificword[index+1]
                rev=confusionrev.loc[X,Y]
                
                total = total_freq_bigram[X+Y]
                result=rev/total


        if correction.iloc[i,5]=='substitution':
            X=correction.iloc[i,2]
            Y=correction.iloc[i,3]
            sub = confusionsub.loc[X,Y]

            total = total_freq_1gram[Y]
                #lis.append(total)
            result=sub/total
            
        correction.loc[i,'probability of t given c'] = result


### 2.8 Calculate Posterior

In [13]:
def Correction(typos):
    from tqdm import tqdm_notebook

    output = []
    no_correction = 0
    no_correct_word = []

    for typo in tqdm_notebook(typos):
        try:
            candidates,cand_type = find_candidates(typo,corpus)
            correction = find_position(typo,candidates)
            correction = pd.DataFrame(correction)

            if correction.empty:  
                output.append(typo)
                no_correct_word.append(typo)
                no_correction += 1

            else:
                correction.columns = ['Typo','Correction','old','new','index','type']
                correction = correction[correction['index'] >= 0]

                if len(correction) == 1:
                    output.append(correction.loc[0,'Correction'])
                else:
                    # 1. calculate the prior

                    freq = [] # the number of times that the proposed correction c appears in the training set
                    for cor in correction['Correction']:
                        freq.append(total_freq[cor])    

                    N = len(training)
                    V = len(corpus)

                    prior = (pd.DataFrame(freq) + 0.5)/(N + V/2)

                    correction['probability of c'] = prior

                    probabilityfunction(correction)

                    # 3. Calculate the posterior and find the correction that has maximum posterior

                    correction['posterior'] = correction['probability of c'] * correction['probability of t given c']
                    best = correction[correction.posterior == correction.posterior.max()].Correction.values[0]
                    output.append(best)
        except:
#             print(typo)
            output.append(typo)
            no_correct_word.append(typo)
            no_correction += 1
    #         break
    
    return (pd.Series(output)),no_correction,no_correct_word


### 2.9 Evaluation

In [65]:
precision_denominator = len(cleaned_typo)
recall_denominator = len(cleaned_typo_and_correct)

In [47]:
from nltk import ngrams

def vintersection(list1,list2,ngram = False):
    list1_dict = {}
    list2_dict = {}
    
    if ngram:
        list1 = list(''.join(list1))
        list2 = list(''.join(list2))

    for i in list1:
        list1_dict[i] = list1_dict.get(i,0) + 1

    for i in list2:
        list2_dict[i] = list2_dict.get(i,0) + 1
        
    result = {}
    for key in list1_dict.keys():
        if key in list2_dict.keys():
            value1 = list1_dict[key]
            value2 = list2_dict[key]
            min_value = min(value1,value2)
            result[key] = min_value
    return sum(result.values())

def precision(GT,OCR,ngram = False):
    TP = vintersection(GT,OCR,ngram)
    if ngram:
        OCR = list(''.join(OCR))
    return TP/len(OCR)

def recall(GT,OCR,ngram = False):
    TP = vintersection(GT,OCR)
    if ngram:
        GT = list(''.join(GT))
    return TP/len(GT)

# 3.1 Case 1: Correct all typos

### Correction

In [None]:
typos = true_typo['typo']
correct = true_typo['correct']

Correction_output,no_correction_num,no_correct_word = Correction(typos)

HBox(children=(IntProgress(value=0, max=12111), HTML(value='')))

In [None]:
print('Accuracy: {:.2%}'.format(vintersection(Correction_output,correct)/len(Correction_output)))

In [None]:
print('No corrected rate: {:.1%}'.format(no_correction_num/len(Correction_output)))

### Output correction file

In [None]:
Correction_output.to_csv('Correction_output_all.csv')

### Calculate Recall & precision 

In [74]:
recall(Correction_output,correct[:100])

0.63

In [75]:
Correction_output

0     communlcations
1          companies
2            provide
3               this
4             includ
5            heavily
6           involved
7                  n
8           crltlcal
9      environmental
10            issues
11              sent
12          national
13       legislators
14          disposal
15              bill
16          continue
17                 m
18         suparfund
19             which
20          detailed
21               cma
22     rlghtitoiknow
23            action
24         continues
25             toxic
26        prevention
27          requires
28         reporting
29        nformatlon
           ...      
70                mm
71          analysis
72         headlines
73              june
74          reauthor
75            gained
76         prlaarlly
77          ntruslon
78               nto
79    reconclllatlon
80              from
81         edltorlal
82             might
83       antlclpated
84         following
85          accident
86           

# Case 2: Only consider edit distance = 1 case

In [86]:
# cleaned_typo.map(true_typo_dict)
# cleaned_typo
ed_1_typo = []
from nltk import edit_distance
for i in range(len(true_typo)):
    typo = true_typo.loc[i,'typo']
    correct = true_typo.loc[i,'correct']
    if edit_distance(typo,correct) == 1:
        ed_1_typo.append([typo,correct])
                         
ed_1_typo_df = pd.DataFrame(ed_1_typo)
ed_1_typo_df.columns = ['typo','correct']

### Correction

In [91]:
typos = ed_1_typo_df['typo']
correct = ed_1_typo_df['correct']

Correction_output,no_correction_num,no_correct_word = Correction(typos[:200])

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [94]:
print('No corrected rate: {:.1%}'.format(no_correction_num/len(Correction_output)))

No corrected rate: 2.5%


In [95]:
print('Accuracy: {:.2%}'.format(vintersection(Correction_output,correct[:200])/len(Correction_output)))

Accuracy: 91.00%


### Output Correction

In [None]:
Correction_output.to_csv('Correction_output_ed_1.csv')

### Calculate Recall & precision 

In [58]:
vintersection(Correction_output,correct)/

92