### Detection

In [2]:
from detection import *

words, label = labelTesseract()
train_data, test_data, train_label, test_label = div_train(words, label)
bigram_dict = compute_bigram()
featureMatrix_train = buildFeatures(train_data, bigram_dict)
featureMatrix_test = buildFeatures(test_data, bigram_dict)

# uncomment for testing
'''
head = featureMatrix_train.head()
print(head.to_string())
'''

# build classifier
svm_class = SVC(kernel='rbf', verbose=True, gamma='scale')
svm_class.fit(featureMatrix_train, train_label)

# prediction
prediction = svm_class.predict(featureMatrix_test)

output = pd.DataFrame({'data': test_data,
                       'label': prediction})

print(output[:20])

##### evaluation
#confustion Matrix
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(test_label, prediction))
print(classification_report(test_label, prediction))

[LibSVM]               data  label
0          proposal      1
1                1n      0
2           unclear      1
3              been      1
4          adequacy      1
5                 7      0
6               MCA      1
7               the      1
8               you      1
9   recommendatlons      0
10            thelr      1
11              the      1
12              Los      1
13            afflx      1
14    understandlng      0
15             that      1
16        posltlons      0
17              the      1
18         Drlnklng      0
19       1ndustrlal      0
[[12204  4508]
 [ 2622 24967]]
              precision    recall  f1-score   support

           0       0.82      0.73      0.77     16712
           1       0.85      0.90      0.88     27589

   micro avg       0.84      0.84      0.84     44301
   macro avg       0.84      0.82      0.82     44301
weighted avg       0.84      0.84      0.84     44301



In [232]:
typos = output[output.label == 0].reset_index(drop = True).data

In [326]:
import string
# remove puncutation and numbers
cleaned_typos = cleaned_typos.str.extract(r'([a-zA-Z]+)').dropna()[0]
cleaned_typos.reset_index(drop = True,inplace = True)
cleaned_typos = cleaned_typos.str.lower()

In [327]:
cleaned_typos.to_csv('cleaned_typos.csv')

### Dictionary

In [228]:
import glob
truth_counts = 0
corpus = []
# create a list of all .txt files
truth_files_list = glob.glob('../data/ground_truth/*.txt')
# reading the ground truth file
for file in truth_files_list:
    with open(file) as fd:
        for line in fd:
            each_line = line.strip().split()
            for word in each_line:
                corpus.append(word)
                truth_counts += 1
corpus = pd.Series(corpus)
corpus = corpus.str.extract(r'([a-zA-Z]+)').dropna()[0]
corpus = corpus.str.lower().unique()

### Find Candidates

In [329]:
import numpy as np
import pandas as pd
from collections import Counter 
from nltk import edit_distance

def typo_classification(typo,correct):
    if (len(typo) > len(correct)):
        return 'insertion'
    elif (len(typo) < len(correct)):
        return 'deletion'
    else:
        typo_count = Counter(typo)
        correct_count = Counter(correct)
        if typo_count == correct_count:
            return 'reversal'
        else:
            return 'subsititution'

def find_candidates(typo,corpus):
    candidates = []
    candi_type = []
    
    
    for word in corpus:
        ed = edit_distance(typo,word)
        
        word_type = typo_classification(typo,word)
        if len(typo) > 8:
            if ed in [1,2]:
                candidates.append(word)
                candi_type.append(word_type)
        else:
            if ((ed == 1) |((ed == 2) & (word_type == 'reversal'))):
                candidates.append(word)
                candi_type.append(word_type)
    return candidates,candi_type

def find_position(typo,candidates):
    position = []
    for corr in candidates:
        typo_type = typo_classification(typo,corr)
        
        if (typo_type == 'deletion'):
            typo += '#'

            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    if corr[i] != corr[i-1]:
                        typo = typo[:-1]
                        position.append([typo,corr,"@",corr[i],i,typo_type])
                        break
                    else:
                        typo = typo[:-1]
                        position.append([typo,corr,"@",corr[i],i,typo_type])
                        position.append([typo,corr,"@",corr[i],i+1,typo_type])
                        break
                        
                i += 1
        elif (typo_type == 'insertion'):
            corr += '#'

            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    if typo[i] != typo[i-1]:
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"@",i,typo_type])
                        break
                    elif ((typo[i] == typo[i-1])& (typo[i] == typo[i-2])):
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"@",i,typo_type])
                        position.append([typo,corr,typo[i],"@",i-1,typo_type])
                        position.append([typo,corr,typo[i],"@",i-2,typo_type])
                        break
                    else:
                        corr = corr[:-1]
                        position.append([typo,corr,typo[i],"@",i,typo_type])
                        position.append([typo,corr,typo[i],"@",i-1,typo_type])
                        break
                i += 1
        elif (typo_type == 'subsititution'):
            i = 0
            while i < len(corr):
                if (corr[i] != typo[i]):
                    position.append([typo,corr,typo[i],corr[i],i,typo_type])
                    break
                i+=1
                
        elif (typo_type == 'reversal'):
            i = 0
            while i < len(corr)-1:
                if ((typo[i] == corr[i+1]) & (typo[i+1] == corr[i])):
                    typo_comb = typo[i] + typo[i+1]
                    position.append([typo,corr,typo_comb,typo_comb[::-1],i,typo_type])
                    break
                i +=1
    return position

In [376]:
# imput lowercase typo
typo = cleaned_typos[123]
print(typo)
candidates,cand_type = find_candidates(typo,corpus)
correction = find_position(typo,candidates)

september


In [377]:
correction = pd.DataFrame(correction)
correction.columns = ['Typo','Correction','old','new','index','type']
correction

Unnamed: 0,Typo,Correction,old,new,index,type
0,september,stember,e,@,1,insertion
1,september,septemb,e,@,7,insertion
2,september,septembex,r,x,8,subsititution


In [367]:
#confusiondel=pd.read_csv('C:/Users/Alienware/Desktop/ads project1/Spring2019-Proj4-grp6/data/confusion_matrix/add_matrix.csv')
confusionsub=pd.read_csv('../data/confusion_matrix/sub_matrix.csv')
confusionadd=pd.read_csv('../data/confusion_matrix/add_matrix.csv')
confusiondel=pd.read_csv('../data/confusion_matrix/del_matrix.csv')
confusionrev=pd.read_csv('../data/confusion_matrix/rev_matrix.csv')

In [368]:
truth_counts = 0
teseract_clean = []
# create a list of all .txt files
truth_files_list = glob.glob('../data/tesseract/*.txt')
# reading the ground truth file
for file in truth_files_list:
    with open(file) as fd:
        for line in fd:
            each_line = line.strip().split()
            for word in each_line:
                teseract_clean.append(word)
                truth_counts += 1
teseract_clean = pd.Series(teseract_clean)
teseract_clean = teseract_clean.str.extract(r'([a-zA-Z]+)').dropna()[0]
teseract_clean = teseract_clean.str.lower().unique()

In [392]:
correction.values

array([['september', 'stember', 'e', '@', 1, 'insertion'],
       ['september', 'septemb', 'e', '@', 7, 'insertion'],
       ['september', 'septembex', 'r', 'x', 8, 'subsititution']],
      dtype=object)

In [393]:
for row in correction.values:
    typo = row[0]
    specificword = row[1]
    index = row[4]
    typo_type = row[5]
    
    print(index,typo_type)

1 insertion
7 insertion
8 subsititution


In [397]:
def probability_of_t_given_c(correction_df):
    correction_list = correction_df.values
    
    typo = row[0]
    specificword = row[1]
    index = row[4]
    typo_type = row[5]
    
    if typo_type =='insertion':
        if index != 0:
            
            #index=correction.iloc[i,4]
            X=specificword[index-1]
            Y=specificword[index]
            haha=confusionadd[confusionadd.iloc[:,0]==X]
            add=haha[Y].iloc[0]
            total=0
            for z in range(0,len(truth_clean)):
                total=total+truth_clean[z].count(X+Y)
                #lis.append(total)
                result =add/total
        if index == 0:
            X='#'
            Y=specificword[index]
            haha=confusionadd[confusionadd.iloc[:,0]==X]
            add=haha[Y].iloc[0]
            totall=len(truth_clean)
            
            result =add/totall
            
    if correction.iloc[i,5]=='deletion':
        if index != 0:
            
            #index=correction.iloc[i,4]
            X=specificword[index-1]
            Y=specificword[index]
            haha=confusiondel[confusiondel.iloc[:,0]==X]
            delt=haha[Y].iloc[0]
            total=0
            for z in range(0,len(truth_clean)):
                total=total+truth_clean[z].count(X+Y)
                #lis.append(total)
                result =delt/total
            
            
        if index == 0:
            X='#'
            Y=specificword[index]
            haha=confusiondel[confusiondel.iloc[:,0]==X]
            delt=haha[Y].iloc[0]
            totall=len(truth_clean)
            
            result =delt/totall
            
    if correction.iloc[i,5]=='reversal':
        
            
            #index=correction.iloc[i,4]
            X=specificword[index]
            Y=specificword[index+1]
            haha=confusionrev[confusionrev.iloc[:,0]==X]
            rev=haha[Y].iloc[0]
            total=0
            for z in range(0,len(truth_clean)):
                total=total+truth_clean[z].count(X+Y)
                #lis.append(total)
                result =rev/total
        
                
    if correction.iloc[i,5]=='substitution':
        X=correction.iloc[i,2]
        Y=correction.iloc[i,3]
        heihei=confusionsub[confusionsub.iloc[:,0]==X]
        sub=heihei[Y].iloc[0]
        #lis.append(sub)

        total=0
        for z in range(0,len(truth_clean)):
            total=total+truth_clean[z].count(Y)
            #lis.append(total)
            result =sub/total
    

In [373]:
N=len(truth_clean)+len(teseract_clean)
V=len(set(truth_clean))+len(set(teseract_clean))
for a in range(0,correction.shape[0]):
    cor=correction.iloc[a,1]
    correction.iloc[a,7]=((truth_clean.tolist().count(cor)+teseract_clean.tolist().count(cor)) + 0.5)/(N + V/2)



In [374]:
correction

Unnamed: 0,Typo,Correction,old,new,index,type,probability of t given c,probability of c
0,september,stember,e,@,1,insertion,0.05062,4.7e-05
1,september,septemb,e,@,7,insertion,0.0,4.7e-05
2,september,septembex,r,x,8,subsititution,0.0,4.7e-05


In [361]:
range(0,correction.shape[0])

range(0, 3)

In [362]:
truth_clean.tolist().count(cor)

1