#### Loading packages

In [96]:
import glob
import errno
import string
import numpy as np
import csv
import pandas as pd
import time
import os

#### Creating corpus

In [2]:
def _build_corpus(pattern):
    """Return the lower-cased word tokens of every file matching ``pattern``.

    Each line has the characters in ``string.punctuation`` and all digit
    characters removed, is lower-cased, and is split on whitespace.
    Files are decoded as UTF-8.
    """
    # Build the punctuation-stripping table once instead of once per line.
    strip_punct = str.maketrans('', '', string.punctuation)
    words = []
    for name in glob.glob(pattern):
        try:
            with open(name, encoding='utf8') as f:
                for line in f:
                    out = line.translate(strip_punct)
                    out = ''.join([i for i in out if not i.isdigit()])
                    words.extend(out.lower().split())
        except IOError as exc:
            # A directory that matches the pattern is skipped; any other
            # I/O error is re-raised.
            if exc.errno != errno.EISDIR:
                raise
    return words


# Word corpus of the ground-truth texts, also saved one word per line.
truth = _build_corpus('data/ground_truth/*.txt')
np.savetxt('output/truth_corpus.dat', truth, fmt='%s', encoding='utf8')

# Word corpus of the Tesseract OCR texts, also saved one word per line.
tess = _build_corpus('data/tesseract/*.txt')
np.savetxt('output/tess_corpus.dat', tess, fmt='%s', encoding='utf8')

#### Reading data for feature extraction

In [3]:
# Collect the distinct (error, truth) word pairs from the rules-based error table.
# Column 1 holds the erroneous word and column 2 the intended word; both are
# lower-cased, and rows where the two are equal are ignored.
Error = []
Truth = []
pair = []
# Hashed mirror of `pair`, so the duplicate check is a constant-time lookup
# rather than a scan over every pair collected so far.
seen_pairs = set()
with open('data/Error_df_rules_based.csv', encoding='utf8') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        err = row[1].lower()
        trt = row[2].lower()
        if err != trt and (err, trt) not in seen_pairs:
            seen_pairs.add((err, trt))
            Error.append(err)
            Truth.append(trt)
            pair.append([err, trt])

# Drop the first two collected pairs.
# NOTE(review): presumably the CSV header row plus one unwanted entry — confirm against the file.
Error = Error[2:]
Truth = Truth[2:]

#### Feature extraction

In [4]:
from lib.feature_scoring import n_gram
from lib.feature_scoring import candidate_search
from lib.feature_scoring import LED_score
from lib.feature_scoring import SS_score
from lib.feature_scoring import LP_score
from lib.feature_scoring import ECP_score
from lib.feature_scoring import RCP_score

In [7]:
# Column accumulators for the feature table. The first element of each list is
# the column name, so it becomes the header row when the lists are zipped.
W_error=['Typo']
W_truth=['Truth']
W_cand = ['Candidate']
Label = ['Label']
LED = ['led_score']
SS = ['ss_score']
LP = ['lp_score']
ECP = ['ECP_score']  # never filled: the ECP feature is currently disabled

n = 3 # n_gram
for i in range(len(Error)):
    w_e = Error[i]
    w_c = Truth[i]
    cand_list = candidate_search(truth, w_e)
    print('word ',i+1,', error: ', w_e, ', truth: ', w_c)

    # Score each candidate once and reuse the value for normalisation below.
    # NOTE(review): assumes LP_score returns the same value for the same arguments — confirm.
    LP_freq = [LP_score(s, truth) for s in cand_list]
    # Computed once per error word rather than once per candidate.
    lp_max = max(LP_freq) if LP_freq else 0

    # Disabled features (kept for reference):
    #   gram_list = n_gram(w_e, tess, n)
    #   ecp = ECP_score(gram_list, s, truth, n) / (max ECP over the candidates), 0 if that max is 0
    #   rcp = RCP_score(w_e, s, tess, truth)

    for s, lp_raw in zip(cand_list, LP_freq):
        led = LED_score(w_e, s)
        ss = SS_score(w_e, s, N=3)
        # Normalise by the largest score among this word's candidates; an
        # all-zero candidate set yields 0 instead of dividing by zero (the same
        # rule the disabled ECP code used).
        lp = lp_raw / lp_max if lp_max != 0 else 0
        # 1 when the candidate is exactly the ground-truth word, else 0.
        label = int(s == w_c)
        W_error.append(w_e)
        W_truth.append(w_c)
        W_cand.append(s)
        Label.append(label)
        LED.append(led)
        SS.append(ss)
        LP.append(lp)

word  1 , error:  willlam , truth:  william
word  2 , error:  nvolvng , truth:  involving
word  3 , error:  t , truth:  the
word  4 , error:  aflcid , truth:  aflcio
word  5 , error:  cmng , truth:  attaching
word  6 , error:  admlnlstratlons , truth:  administrations
word  7 , error:  d , truth:  gold
word  8 , error:  jurlsdlctlon , truth:  jurisdiction
word  9 , error:  m , truth:  to
word  10 , error:  mm , truth:  bill


KeyboardInterrupt: 

In [12]:
# Write the accumulated columns row by row as comma-separated text.
feature_columns = (W_error, W_truth, W_cand, LED, SS, LP, Label)
feature_rows = list(zip(*feature_columns))
np.savetxt('output/feature.csv', feature_rows, delimiter=',', fmt='%s', encoding='utf-8')

## Parameter Tuning

#### Retrieve Data

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor

In [3]:
# Load output/feature.csv into a DataFrame.
feature_output = pd.read_csv('output/feature.csv', delimiter = ',')

In [4]:
# Preview the first 20 rows of the feature table.
feature_output.head(20)

Unnamed: 0,Typo,Truth,Candidate,led_score,ss_score,lp_score,Label
0,willlam,william,will,0.25,1.636364,1.0,0
1,willlam,william,willful,0.25,1.607143,0.001712,0
2,willlam,william,william,0.75,2.142857,0.039954,1
3,willlam,william,williams,0.5,1.866667,0.003995,0
4,willlam,william,willing,0.25,1.285714,0.006849,0
5,willlam,william,wills,0.25,1.5,0.000571,0
6,nvolvng,involving,cooling,0.25,0.857143,0.011494,0
7,nvolvng,involving,evolve,0.25,0.961538,0.045977,0
8,nvolvng,involving,evolved,0.25,0.892857,0.034483,0
9,nvolvng,involving,evolves,0.25,0.892857,0.011494,0


In [5]:
# Features: the three candidate scores, selected by name rather than by
# position so the selection cannot silently shift if the column order changes.
X = feature_output[["led_score", "ss_score", "lp_score"]]
# Target: 1 when the candidate is the ground-truth word, else 0.
y = feature_output["Label"]

#### Train & Test split

In [6]:
from sklearn.model_selection import train_test_split
# Shuffled 80/20 split of the rows with a fixed seed.
# NOTE(review): rows are split individually, so candidates belonging to the same
# typo can end up in both the training and the test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True ,random_state=42)

In [7]:
# Report the dimensions of the training features and labels.
print('X_train shape:',X_train.shape,'\n','y_train shape:',y_train.shape)

X_train shape: (1122748, 3) 
 y_train shape: (1122748,)


In [8]:
# Report the dimensions of the test features and labels.
print('X_test shape:',X_test.shape,'\n','y_test shape:',y_test.shape)

X_test shape: (280687, 3) 
 y_test shape: (280687,)


## Ada Boost

In [9]:
# AdaBoost regressor with default settings; it is the estimator handed to the grid search.
# NOTE(review): no random_state is passed, so repeated fits may not give identical results.
model = AdaBoostRegressor()

## Parameter Tuning

In [14]:
# Hyper-parameter grid for the search: 2 x 5 x 3 = 30 combinations.
n_estimators_grid = [50, 100]
learning_rate_grid = [0.01, 0.05, 0.1, 0.3, 1]
loss_grid = ['linear', 'square', 'exponential']

parameters = {
    'n_estimators': n_estimators_grid,
    'learning_rate': learning_rate_grid,
    'loss': loss_grid,
}

In [23]:
# Time a 3-fold cross-validated grid search over `parameters` on the training split.
start = time.time()
ada_grid_search = GridSearchCV(model,parameters,cv = 3)
ada_grid_search_fit = ada_grid_search.fit(X_train, y_train)
end = time.time()
# Elapsed wall-clock time in seconds.
print('Time:',end - start)

Time: 4493.107969999313


In [24]:
# Hyper-parameter combination selected by the grid search.
ada_grid_search_fit.best_params_

{'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 50}

In [25]:
# Estimator selected by the grid search.
ada_grid_search_fit.best_estimator_

AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='exponential',
         n_estimators=50, random_state=None)

In [26]:
# Cross-validated score of the selected combination. No `scoring` was passed to
# GridSearchCV, so this is the estimator's default score.
print(ada_grid_search_fit.best_score_)

0.43120006350478934


## Prediction

In [10]:
# Fit the final model with the hyper-parameters reported by the grid search
# (learning_rate=0.01, loss='exponential', n_estimators=50) and time the fit.
#
# The explicit `base_estimator=None` argument was dropped. It only restated the
# default, and the parameter was deprecated in scikit-learn 1.2 and removed in
# 1.4, where passing it raises TypeError.
#
# NOTE(review): the model is fitted on the full X / y rather than X_train / y_train,
# so any prediction made on X afterwards is in-sample.
start = time.time()
regessor = AdaBoostRegressor(learning_rate=0.01, loss='exponential',
         n_estimators=50, random_state=None)
regessor_fit = regessor.fit(X, y)
end = time.time()
# Elapsed wall-clock time in seconds.
print('Time:',end - start)

Time: 50.80421447753906


In [11]:
# Predict a confidence for every candidate row; X is the same data the model was fitted on.
result = regessor.predict(X)

In [12]:
# Wrap the predictions in a one-column frame and append it to the feature table.
predicted_confidence = pd.DataFrame({"predicted_confidence": result})
# Column-wise concat; rows are aligned on the index of the two frames.
unsorted_test_final_output = pd.concat([feature_output, predicted_confidence], axis=1)
unsorted_test_final_output

Unnamed: 0,Typo,Truth,Candidate,led_score,ss_score,lp_score,Label,predicted_confidence
0,willlam,william,will,0.25,1.636364,1.000000,0,0.413569
1,willlam,william,willful,0.25,1.607143,0.001712,0,0.079066
2,willlam,william,william,0.75,2.142857,0.039954,1,0.615429
3,willlam,william,williams,0.50,1.866667,0.003995,0,0.095706
4,willlam,william,willing,0.25,1.285714,0.006849,0,0.058059
5,willlam,william,wills,0.25,1.500000,0.000571,0,0.058059
6,nvolvng,involving,cooling,0.25,0.857143,0.011494,0,0.024408
7,nvolvng,involving,evolve,0.25,0.961538,0.045977,0,0.026367
8,nvolvng,involving,evolved,0.25,0.892857,0.034483,0,0.026367
9,nvolvng,involving,evolves,0.25,0.892857,0.011494,0,0.026367


## Measurement

In [97]:
#define files path
tess_dir = "./data/tesseract/"
ground_dir = "./data/ground_truth_trimmed/"
# Keep only the .txt documents, in a stable order. os.listdir returns every
# directory entry (e.g. hidden system files) in arbitrary order, and a stray
# entry would later be opened as a UTF-8 text file in both directories.
file_name = sorted(entry for entry in os.listdir(tess_dir) if entry.endswith(".txt"))

In [81]:
# For each typo, keep its (at most) 10 candidates with the highest predicted confidence.
sorted_test_final_output = unsorted_test_final_output.groupby("Typo").apply(lambda x: x.nlargest(10,columns = 'predicted_confidence'))

In [141]:
# Of the kept candidates, the rows whose candidate is the ground-truth word (Label == 1).
correct_error= sorted_test_final_output[sorted_test_final_output.Label==1]
correct_error.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Typo,Truth,Candidate,led_score,ss_score,lp_score,Label,predicted_confidence
Typo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
abllltles,409920,abllltles,abilities,abilities,0.25,1.222222,1.0,1,0.059123
ablllty,512938,ablllty,ability,ability,0.5,1.178571,1.0,1,0.242354
abollsh,440280,abollsh,abolish,abolish,0.75,2.142857,0.02439,1,0.564863
acceptlng,810793,acceptlng,accepting,accepting,0.75,3.333333,0.068966,1,0.640412
accesslng,1155948,accesslng,accessing,accessing,0.75,3.333333,0.017241,1,0.557403


In [105]:
#number of errors with correct prediction
# Counts the rows of `correct_error` (the kept candidates with Label == 1).
# The original referenced `correct`, a name no cell in this notebook defines,
# so the cell failed with NameError on a fresh kernel.
num_added_correct = correct_error.shape[0]
num_added_correct

3222

In [98]:
#read tesseract outputs
# Collect the lower-cased whitespace-separated tokens of every Tesseract file.
Tess_dir = []
for fname in file_name:
    with open(os.path.join(tess_dir, fname), encoding='utf-8') as handle:
        for text_line in handle:
            Tess_dir.extend(
                token.lower()
                for token in text_line.split()
                # Skip tokens that are a substring of string.punctuation or consist only of digits.
                if not (token in string.punctuation or token.isdigit())
            )

In [100]:
#read ground truth outputs
# Collect the lower-cased whitespace-separated tokens of every ground-truth file.
# The file names come from the Tesseract directory listing.
Truth_dir = []
for fname in file_name:
    with open(os.path.join(ground_dir, fname), encoding='utf-8') as handle:
        for text_line in handle:
            Truth_dir.extend(
                token.lower()
                for token in text_line.split()
                # Skip tokens that are a substring of string.punctuation or consist only of digits.
                if not (token in string.punctuation or token.isdigit())
            )

In [None]:
#word-wise measurement
# A Tesseract token counts as correct when the same word occurs anywhere in the
# ground-truth tokens.
correct_set = set(Tess_dir) & set(Truth_dir)
num_correct = sum(1 for token in Tess_dir if token in correct_set)
num_OCR_opt = len(Tess_dir)
num_ground_t = len(Truth_dir)

# After post-processing, the corrected errors are added to the correct count.
num_correct_after = num_correct + num_added_correct

word_wise_precision_before = num_correct / num_OCR_opt
word_wise_recall_before = num_correct / num_ground_t
word_wise_precision_after = num_correct_after / num_OCR_opt
word_wise_recall_after = num_correct_after / num_ground_t


In [None]:
#character-wise measurement

In [113]:
# Flatten the Tesseract tokens into a single list of characters.
Tess_char = [ch for token in Tess_dir for ch in token]

In [117]:
# Flatten the ground-truth tokens into a single list of characters.
Truth_char = [ch for token in Truth_dir for ch in token]

In [125]:
# Characters of the ground-truth words for the correctly recovered errors.
new_correct_char = [ch for truth_word in correct_error.Truth for ch in truth_word]

In [None]:
# Character-wise counterpart of the word-wise measurement: a Tesseract character
# counts as correct when that character occurs anywhere in the ground truth.
correct_char = set(Tess_char) & set(Truth_char)
num_correct_char = sum(1 for ch in Tess_char if ch in correct_char)
num_OCR_char = len(Tess_char)
num_truth_char = len(Truth_char)
num_new_char = len(new_correct_char)

# NOTE(review): every character of each corrected word is added on top of the
# existing correct count, so the "after" ratios can exceed 1 (the recorded
# character-wise precision is 1.018775) — the definition needs revisiting.
num_correct_char_after = num_correct_char + num_new_char

char_wise_precision_before = num_correct_char / num_OCR_char
char_wise_recall_before = num_correct_char / num_truth_char
char_wise_precision_after = num_correct_char_after / num_OCR_char
char_wise_recall_after = num_correct_char_after / num_truth_char

In [142]:
# create measurement table
# Rows are the four metrics; columns compare raw Tesseract output with the post-processed output.
metric_names = ["word_wise_recall","word_wise_precision","character_wise_recall","character_wise_precision"]
col1 = [
    word_wise_recall_before,
    word_wise_precision_before,
    char_wise_recall_before,
    char_wise_precision_before,
]
col2 = [
    word_wise_recall_after,
    word_wise_precision_after,
    char_wise_recall_after,
    char_wise_precision_after,
]
data = {"Tesseract" : col1, "Tesseract_with_postprocessing":col2}
df = pd.DataFrame(data, index=metric_names)
df

Unnamed: 0,Tesseract,Tesseract_with_postprocessing
word_wise_recall,0.678508,0.689616
word_wise_precision,0.679021,0.690137
character_wise_recall,0.980081,0.998586
character_wise_precision,0.999896,1.018775


#### Train Test Split by Group (Delete if we don't need this.)

In [97]:
from sklearn.model_selection import GroupShuffleSplit

# Split by typo so that all candidates of one typo stay on the same side.
# This rebinds X_train / X_test / y_train / y_test.
group = pd.Categorical(feature_output["Typo"])
splitter = GroupShuffleSplit(random_state=42)
# Only the first of the generated splits is used.
train_inds, test_inds = next(splitter.split(X, y, group))
train_rows = list(train_inds)
test_rows = list(test_inds)
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]