## Detection

In [1]:
import pandas as pd
import numpy as np

In [None]:
# NOTE(review): labelTesseract, div_train, compute_bigram, buildFeatures and SVC
# are not imported anywhere else in this cell, so they presumably all arrive
# through this wildcard import -- confirm and consider importing them by name.
from detection import *

# labelTesseract() yields three values; only `pair` and `label` are used below.
pair, words, label = labelTesseract()
# div_train() returns six values: train/test data, train/test labels and the
# matching ground-truth sequences (semantics defined in `detection` -- not visible here).
train_data, test_data, train_label, test_label, ground_truth_train, ground_truth_test = div_train(pair, label)

# Debug aid kept as a bare string literal (it is never executed):
# uncomment to test for truth, tesseract pair
'''
print(train_data[:10])
print(ground_truth_train[:10])
print(train_label[:10])

print(test_data[:10])
print(ground_truth_test[:10])
print(test_label[:10])
'''


# Build one feature matrix per partition from the same bigram dictionary.
bigram_dict = compute_bigram()
featureMatrix_train = buildFeatures(train_data, bigram_dict)
featureMatrix_test = buildFeatures(test_data, bigram_dict)

# Debug aid kept as a bare string literal (it is never executed):
# uncomment for testing
'''
head = featureMatrix_train.head()
print(head.to_string())
'''

# Build the classifier: RBF-kernel SVC fitted on the training features.
# verbose=True is what produces the "[LibSVM]" line in the cell output.
svm_class = SVC(kernel='rbf', verbose=True, gamma='scale')
svm_class.fit(featureMatrix_train, train_label)

# Predict a label for every test token.
prediction = svm_class.predict(featureMatrix_test)

# Pair each test token with its predicted label; the Correction section
# later selects the rows of `output` whose label is 0.
output = pd.DataFrame({'data': test_data,
                       'label': prediction})

print(output[:20])

##### Evaluation
# Confusion matrix and per-class precision/recall on the held-out test split.
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(test_label, prediction))
print(classification_report(test_label, prediction))

[LibSVM]

## Correction

In [None]:
# Select the tokens whose predicted label is 0 (presumably "erroneous" --
# confirm the label encoding against the detection module).
typos = output[output.label == 0].reset_index(drop = True).data

import string
# Keep only the first run of ASCII letters of each token (this drops
# punctuation and digits) and discard tokens that contain no letters at all.
# Fix: the chain must start from `typos`; the original read `cleaned_typos`
# before it had ever been assigned, which raises NameError on a fresh kernel.
cleaned_typos = typos.str.extract(r'([a-zA-Z]+)').dropna()[0]
# Re-number the surviving rows 0..n-1 (non in-place so the cell is re-runnable).
cleaned_typos = cleaned_typos.reset_index(drop = True)
cleaned_typos = cleaned_typos.str.lower()

In [None]:
# Persist the cleaned typo list; a later cell reloads this file with
# pd.read_csv('cleaned_typos.csv') and selects its column 'n'.
# NOTE(review): whether Series.to_csv writes a header row by default depends on
# the pandas version, so confirm that the loader's column name still matches.
cleaned_typos.to_csv('cleaned_typos.csv')

### Dictionary


In [2]:
import glob

# Gather every whitespace-delimited token from the ground-truth text files.
truth_files_list = glob.glob('../data/ground_truth/*.txt')
corpus = []
for file in truth_files_list:
    with open(file) as fd:
        for line in fd:
            corpus.extend(line.strip().split())
# Total number of raw tokens read, before any cleaning.
truth_counts = len(corpus)

# Reduce each token to its first run of ASCII letters, drop tokens without
# letters, lower-case the rest and keep one copy of each distinct word.
corpus = (
    pd.Series(corpus)
    .str.extract(r'([a-zA-Z]+)')
    .dropna()[0]
    .str.lower()
    .unique()
)

# Array of distinct ground-truth words, used by the correction cells.
truth_clean = corpus

In [3]:
# Gather every whitespace-delimited token from the Tesseract output files.
# The names `truth_files_list` and `truth_counts` are the same ones the
# ground-truth cell assigns, so running this cell overwrites those values.
truth_files_list = glob.glob('../data/tesseract/*.txt')
teseract_clean = []
for file in truth_files_list:
    with open(file) as fd:
        for line in fd:
            teseract_clean.extend(line.strip().split())
# Total number of raw tokens read, before any cleaning.
truth_counts = len(teseract_clean)

# Reduce each token to its first run of ASCII letters, drop tokens without
# letters, lower-case the rest and keep one copy of each distinct word.
teseract_clean = (
    pd.Series(teseract_clean)
    .str.extract(r'([a-zA-Z]+)')
    .dropna()[0]
    .str.lower()
    .unique()
)

### Find Candidates

In [46]:
import numpy as np
import pandas as pd
from collections import Counter 
from nltk import edit_distance

def typo_classification(typo,correct):
    """Label how `typo` differs from `correct`.

    Returns
    -------
    str
        'insertion'    if the typo is longer than the correct word,
        'deletion'     if it is shorter,
        'reversal'     if both have the same length and contain exactly the
                       same characters with the same multiplicities,
        'substitution' otherwise.
    """
    length_gap = len(typo) - len(correct)
    if length_gap > 0:
        return 'insertion'
    if length_gap < 0:
        return 'deletion'

    # Same length: equal character multisets means only the order differs.
    same_letters = Counter(typo) == Counter(correct)
    return 'reversal' if same_letters else 'substitution'

def find_candidates(typo,corpus):
    """Collect the corpus words close enough to `typo` to be correction candidates.

    A word qualifies when
      * the typo has more than 5 characters and the edit distance is 1, 2 or 3, or
      * the typo has at most 5 characters and the edit distance is 1, or is 2
        while `typo_classification` labels the pair a 'reversal'.

    Parameters
    ----------
    typo : str
        The token to correct.
    corpus : iterable of str
        Vocabulary to search.

    Returns
    -------
    (candidates, candi_type) : tuple of list
        Two parallel lists: the qualifying words (in corpus iteration order)
        and the `typo_classification` label of each.
    """
    is_long = len(typo) > 5
    # Largest edit distance that can ever be accepted for this typo.
    max_distance = 3 if is_long else 2

    candidates = []
    candi_type = []
    for word in corpus:
        # An edit distance is never smaller than the length difference, so
        # words that are too long or too short cannot qualify; skipping them
        # avoids the quadratic distance computation for most of the corpus.
        if abs(len(word) - len(typo)) > max_distance:
            continue

        ed = edit_distance(typo,word)
        if ed < 1 or ed > max_distance:
            continue

        word_type = typo_classification(typo,word)
        # Short typos accept distance 2 only for reversals (plain boolean
        # operators replace the original bitwise | and & on booleans).
        if is_long or ed == 1 or word_type == 'reversal':
            candidates.append(word)
            candi_type.append(word_type)
    return candidates,candi_type

def find_position(typo,candidates):
    """Locate the first edit that separates `typo` from each candidate.

    Every candidate is classified with `typo_classification` and scanned from
    the left; the first differing position produces one or more rows of the form
    ``[typo, candidate, old, new, index, type]`` where '#' means "no character":

      * deletion     : old '#',            new candidate[i]
      * insertion    : old typo[i],        new '#'
      * substitution : old typo[i],        new candidate[i]
      * reversal     : old typo[i:i+2],    new the same two letters swapped

    When the deleted/inserted letter repeats the letter(s) just before it, the
    exact position is ambiguous, so an extra row is emitted for each earlier
    position in the run (at most one extra for deletions, two for insertions).

    Fixes over the previous version:
      * look-behind comparisons are only made when the earlier position exists;
        previously ``i-1`` / ``i-2`` wrapped around to the end of the string for
        i < 2 and produced rows with negative indices;
      * a reversal must involve two different letters, so a run such as "ee"
        present in both words is no longer reported as the swap;
      * `typo` is no longer mutated with a sentinel while iterating.

    Parameters
    ----------
    typo : str
    candidates : iterable of str

    Returns
    -------
    list of list
        The rows described above, in candidate order. Candidates for which no
        differing position is found contribute no row.
    """
    position = []
    for corr in candidates:
        typo_type = typo_classification(typo,corr)

        if typo_type == 'deletion':
            # The typo is shorter; the sentinel guarantees a mismatch is found
            # no later than the end of the typo.
            padded_typo = typo + '#'
            for i in range(len(corr)):
                if corr[i] != padded_typo[i]:
                    position.append([typo,corr,"#",corr[i],i,typo_type])
                    # Doubled letter: the missing copy may be the earlier one.
                    if i > 0 and corr[i] == corr[i-1]:
                        position.append([typo,corr,"#",corr[i],i-1,typo_type])
                    break

        elif typo_type == 'insertion':
            # The candidate is shorter; pad it instead.
            padded_corr = corr + '#'
            for i in range(len(padded_corr)):
                if padded_corr[i] != typo[i]:
                    position.append([typo,corr,typo[i],"#",i,typo_type])
                    # Run of identical letters: the extra copy may be any of
                    # the (up to two) preceding positions as well.
                    if i > 0 and typo[i] == typo[i-1]:
                        position.append([typo,corr,typo[i],"#",i-1,typo_type])
                        if i > 1 and typo[i] == typo[i-2]:
                            position.append([typo,corr,typo[i],"#",i-2,typo_type])
                    break

        elif typo_type == 'substitution':
            for i in range(len(corr)):
                if corr[i] != typo[i]:
                    position.append([typo,corr,typo[i],corr[i],i,typo_type])
                    break

        elif typo_type == 'reversal':
            for i in range(len(corr)-1):
                swapped = typo[i] == corr[i+1] and typo[i+1] == corr[i]
                # typo[i] != corr[i] rules out two identical adjacent letters,
                # which satisfy the swap test without being an edit.
                if swapped and typo[i] != corr[i]:
                    typo_comb = typo[i] + typo[i+1]
                    position.append([typo,corr,typo_comb,typo_comb[::-1],i,typo_type])
                    break
    return position

In [5]:
# Load the four confusion matrices (substitution, insertion, deletion,
# reversal); the first CSV column becomes the row index of each table.
confusionsub, confusionadd, confusiondel, confusionrev = [
    pd.read_csv(matrix_path, index_col = 0)
    for matrix_path in (
        '../data/confusion_matrix/sub_matrix.csv',
        '../data/confusion_matrix/add_matrix.csv',
        '../data/confusion_matrix/del_matrix.csv',
        '../data/confusion_matrix/rev_matrix.csv',
    )
]
# From here on `corpus` is a set of the ground-truth words (the dictionary
# cell bound the same name to the array that `truth_clean` still refers to).
corpus = set(truth_clean)

In [50]:
correction = pd.DataFrame()

def probabilityfunction(correction):
    for i in range(0,correction.shape[0]):
        typo = correction.iloc[i,0]
        index=correction.iloc[i,4]
        specificword=correction.iloc[i,1]
        if correction.iloc[i,5]=='insertion':
            if index != 0:

                #index=correction.iloc[i,4]
                X=specificword[index-1]
                Y=typo[index]
                add =confusionadd.loc[X,Y]
                total=0
                for z in range(0,len(truth_clean)):
                    if X == '#':
                        total += truth_clean[z].startswith(Y)
                    else:
                        total += truth_clean[z].count(X+Y)
                    #lis.append(total)
                correction.iloc[i,6]=add/total
            if index == 0:
                X='#'
                Y=specificword[index]
                add =confusionadd.loc[X,Y]
                totall=len(truth_clean)

                correction.iloc[i,6]=add/totall

        if correction.iloc[i,5]=='deletion':
            if index != 0:

                #index=correction.iloc[i,4]
                X=specificword[index-1]
                Y=specificword[index]
                delt=confusiondel.loc[X,Y]
                total=0
                for z in range(0,len(truth_clean)):
                    total += truth_clean[z].count(X+Y)
                    #lis.append(total)
                correction.iloc[i,6]=delt/total


            if index == 0:
                X='#'
                Y=specificword[index]
                delt=confusiondel.loc[X,Y]
                totall=len(truth_clean)

                correction.iloc[i,6]=delt/totall
        if correction.iloc[i,5]=='reversal':


                #index=correction.iloc[i,4]
                X=specificword[index]
                Y=specificword[index+1]
                rev=confusionrev.loc[X,Y]
                total=0
                for z in range(0,len(truth_clean)):
                    total += truth_clean[z].count(X+Y)
                    #lis.append(total)
                correction.iloc[i,6]=rev/total


        if correction.iloc[i,5]=='substitution':
            X=correction.iloc[i,2]
            Y=correction.iloc[i,3]
            sub = confusionsub.loc[X,Y]
            #lis.append(sub)

            total=0
            for z in range(0,len(truth_clean)):
                
                total += truth_clean[z].count(Y)
                #lis.append(total)
            correction.iloc[i,6]=sub/total
        correction['probability of t given c'] = correction.iloc[i,6]


In [20]:
typos =  pd.read_csv('cleaned_typos.csv') # Detection result(in list) 
typos = list(typos['n'])
typos = [x.lower() for x in typos]
wrong = pd.DataFrame(typos)


In [None]:
from tqdm import tqdm_notebook

output = []
no_correction = 0

for typo in tqdm_notebook(typos):
    try:
        candidates,cand_type = find_candidates(typo,corpus)
        correction = find_position(typo,candidates)
        correction = pd.DataFrame(correction)

        if correction.empty:  
            output.append(typo)
            no_correction += 1

        else:
            correction.columns = ['Typo','Correction','old','new','index','type']
            correction = correction[correction['index'] >= 0]
            
            if len(correction) == 1:
                output.append(correction.loc[0,'Correction'])
            else:
                # 1. calculate the prior

                freq = [] # the number of times that the proposed correction c appears in the training set
                for cor in correction['Correction']:
                    freq.append(len(wrong[wrong[0] == cor]))    

                N = len(truth_clean) +len(teseract_clean) # the number of words
                vocabulary = truth_clean
                V = len(set(vocabulary)) # the vocabulary size 

                prior = (pd.DataFrame(freq) + 0.5)/(N + V/2)

                correction['probability of c'] = prior

                # 2. Calculate the likelihood
                correction['probability of t given c']='defalut value'
                correction['probability of c']='defalut value'

                probabilityfunction(correction)

                # 3. Calculate the posterior and find the correction that has maximum posterior

                correction['posterior'] = correction['probability of c'] * correction['probability of t given c']
                p = correction[correction['posterior'] == max(correction['posterior'])]
                maxp = p['Correction'][p['Correction'].index[0]]
                output.append(''.join(maxp))
    except:
        print(typo)
#         output.append(typo)
#         no_correction += 1
        break
    
        

HBox(children=(IntProgress(value=0, max=12648), HTML(value='')))



In [None]:
# Save the accumulated `output` list (built by the correction loop) as a CSV file.
pd.Series(output).to_csv('correction.csv')

In [71]:
# Spot-check: run candidate generation and edit localisation for one
# hard-coded typo. This rebinds the notebook-level names `typo`,
# `candidates`, `cand_type` and `correction`.
typo = 'taxwrtng'
candidates,cand_type = find_candidates(typo,corpus)
correction = find_position(typo,candidates)
correction = pd.DataFrame(correction)
# Raises if no candidate row was produced (a frame with no columns cannot take six names).
correction.columns = ['Typo','Correction','old','new','index','type']

# Display only the rows with a non-negative edit position; the filtered frame
# is not assigned, so `correction` itself stays unfiltered.
correction[correction['index'] >= 0]

Unnamed: 0,Typo,Correction,old,new,index,type
0,taxwrtng,taxing,w,#,3,insertion


In [72]:
# Manual walk-through of the scoring steps for the `correction` frame built in
# the spot-check cell (expects its six original columns).
correction.columns = ['Typo','Correction','old','new','index','type']
# Fix: filter on the 'index' *column* (the edit position). `correction.index`
# is the frame's row labels, which are never negative, so the old filter kept
# every row. `.copy()` makes the column assignments below act on an
# independent frame.
correction = correction[correction['index'] >= 0].copy()

# 1. calculate the prior

freq = [] # the number of times that the proposed correction c appears in `wrong`
for cor in correction['Correction']:
    freq.append(len(wrong[wrong[0] == cor]))

N = len(truth_clean) +len(teseract_clean) # the number of words
vocabulary = truth_clean
V = len(set(vocabulary)) # the vocabulary size

# Plain array, so the values are assigned by position rather than aligned on
# the (possibly filtered) row labels.
prior = (np.array(freq) + 0.5)/(N + V/2)

correction['probability of c'] = prior

# 2. Calculate the likelihood
# Fix: only the likelihood column gets a placeholder; the old code also
# overwrote the prior computed above with the string 'defalut value'.
correction['probability of t given c'] = float('nan')

probabilityfunction(correction)

# 3. Calculate the posterior and find the correction that has maximum posterior

# correction['posterior'] = correction['probability of c'] * correction['probability of t given c']
# p = correction[correction['posterior'] == max(correction['posterior'])]
# maxp = p['Correction'][p['Correction'].index[0]]
# output.append(''.join(maxp))



In [73]:
# Display the candidate table after the manual scoring steps.
correction

Unnamed: 0,Typo,Correction,old,new,index,type,probability of c,probability of t given c
0,taxwrtng,taxing,w,#,3,insertion,,


In [79]:
# Letters for the manual insertion lookup: X is the candidate's letter at
# position 2 (just before the edit) and Y is the typo's letter at position 3.
X='taxing'[3-1]
Y='taxwrtng'[3]

In [77]:
# Insertion count for the pair (X, Y) from the insertion confusion matrix.
add = confusionadd.loc[X, Y]

# Denominator: number of vocabulary words starting with Y when X is the
# word-start marker, otherwise total occurrences of the bigram X+Y.
if X == '#':
    total = sum(vocab_word.startswith(Y) for vocab_word in truth_clean)
else:
    total = sum(vocab_word.count(X + Y) for vocab_word in truth_clean)
# correction.iloc[i,6]=add/total

In [90]:
# Display the insertion count looked up from confusionadd for (X, Y).
add

0

In [94]:
# Cross-check: look the same entry up directly by its literal row/column labels.
confusionadd.loc['x','w']

0

In [93]:
# Display the inserted letter used in the manual lookup.
# Fix: the cell referenced `Yx`, a name that is never assigned in this
# notebook (NameError on a fresh kernel); the recorded output 'w' is the
# value of `Y`.
Y

'w'