#### Loading packages

In [1]:
import glob
import errno
import string
import numpy as np
import csv
import pandas as pd
import time
import os

#### Creating corpus

In [3]:
truth = []
path = '../data/ground_truth/*.txt'
files = glob.glob(path)
for name in files:
    try:
        with open(name, encoding='utf8') as f:
            for line in f:
                out = line.translate(str.maketrans('', '', string.punctuation))
                out = ''.join([i for i in out if not i.isdigit()])
                out = out.lower().split()
                truth.extend(out)
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise
np.savetxt('../output/truth_corpus.dat', truth, fmt='%s', encoding='utf8')

tess = []
path = '../data/tesseract/*.txt'
files = glob.glob(path)
for name in files:
    try:
        with open(name, encoding='utf8') as f:
            for line in f:
                out = line.translate(str.maketrans('', '', string.punctuation))
                out = ''.join([i for i in out if not i.isdigit()])
                out = out.lower().split()
                tess.extend(out)
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise
np.savetxt('../output/tess_corpus.dat', tess, fmt='%s', encoding='utf8')

#### Reading data for feature extraction

In [4]:
Error = []
Truth = []
pair = []
with open('../data/Error_df_rules_based.csv', encoding='utf8') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        err = row[1].lower()
        trt = row[2].lower()
        if err != trt and [err, trt] not in pair:
            Error.append(err)
            Truth.append(trt)
            pair.append([err, trt])
            
Error = Error[2:]
Truth = Truth[2:]

#### Feature extraction

In [4]:
from lib.feature_scoring import n_gram
from lib.feature_scoring import candidate_search
from lib.feature_scoring import LED_score
from lib.feature_scoring import SS_score
from lib.feature_scoring import LP_score
from lib.feature_scoring import ECP_score
from lib.feature_scoring import RCP_score

In [None]:
W_error=['Typo']
W_truth=['Truth']
W_cand = ['Candidate']
Label = ['Label']
LED = ['led_score']
SS = ['ss_score']
LP = ['lp_score']
ECP = ['ECP_score']

n = 3 # n_gram
for i in range(len(Error)):
    w_e = Error[i]
    w_c = Truth[i]
    cand_list = candidate_search(truth, w_e)
    print('word ',i+1,', error: ', w_e, ', truth: ', w_c)
    
#    gram_list = n_gram(w_e, tess, n)
    LP_freq = []
#    ECP_freq = []
    for s in cand_list:
        lp_freq = LP_score(s, truth)
        LP_freq.append(lp_freq)
#        ecp_freq = ECP_score(gram_list, s, truth, n)
#       ECP_freq.append(ecp_freq)
        
    for j in range(len(cand_list)):
        s = cand_list[j]
        led = LED_score(w_e, s)
        ss = SS_score(w_e, s, N=3)
        lp = LP_score(s, truth)/max(LP_freq)
#        if max(ECP_freq)==0: ecp=0
#        else: ecp = ECP_score(gram_list, s, truth, n)/max(ECP_freq)
#        rcp = RCP_score(w_e, s, tess, truth)
        label = int(s == w_c)
#        print('candidate:', s, '\tscores =', '{:03.2f}'.format(led),', {:03.2f}'.format(ss),', {:06.5f}'.format(lp), '\tlabel=', label)
        W_error.append(w_e)
        W_truth.append(w_c)
        W_cand.append(s)
        Label.append(label)
        LED.append(led)
        SS.append(ss)
        LP.append(lp)
        #ECP.append(ecp)

In [12]:
np.savetxt('../output/feature.csv', [p for p in zip(W_error, W_truth, W_cand, LED, SS, LP, Label)], delimiter=',', fmt='%s', encoding='utf-8')

## Parameter Tunning

#### Retreive Data

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor

In [6]:
feature_output = pd.read_csv('../output/feature.csv', delimiter = ',')

In [8]:
feature_output.head(20)

Unnamed: 0,Typo,Truth,Candidate,led_score,ss_score,lp_score,Label
0,willlam,william,will,0.25,1.636364,1.0,0
1,willlam,william,willful,0.25,1.607143,0.001712,0
2,willlam,william,william,0.75,2.142857,0.039954,1
3,willlam,william,williams,0.5,1.866667,0.003995,0
4,willlam,william,willing,0.25,1.285714,0.006849,0
5,willlam,william,wills,0.25,1.5,0.000571,0
6,nvolvng,involving,cooling,0.25,0.857143,0.011494,0
7,nvolvng,involving,evolve,0.25,0.961538,0.045977,0
8,nvolvng,involving,evolved,0.25,0.892857,0.034483,0
9,nvolvng,involving,evolves,0.25,0.892857,0.011494,0


In [35]:
X = feature_output[feature_output.columns[0:6]]
y = feature_output["Label"]

#### Train & Test split

In [65]:
from sklearn.model_selection import GroupShuffleSplit
group = pd.Categorical(feature_output["Typo"])
train_inds, test_inds = next(GroupShuffleSplit(random_state=42).split(X, y, group))
X_train, X_test, y_train, y_test = X.iloc[list(train_inds)], X.iloc[list(test_inds)], y.iloc[list(train_inds)], y.iloc[list(test_inds)]
train_words = X_train[X_train.columns[0:3]]
test_words = X_test[X_test.columns[0:3]]
X_train = X_train[X_train.columns[3:6]]
X_test = X_test[X_test.columns[3:6]]

In [66]:
print('X_train shape:',X_train.shape,'\n','y_train shape:',y_train.shape)

X_train shape: (1238133, 3) 
 y_train shape: (1238133,)


In [67]:
print('X_test shape:',X_test.shape,'\n','y_test shape:',y_test.shape)

X_test shape: (165302, 3) 
 y_test shape: (165302,)


## Ada Boost

In [9]:
model = AdaBoostRegressor()

## Parameters Tunning

In [14]:
parameters = {
 'n_estimators': [50, 100],
 'learning_rate' : [0.01,0.05,0.1,0.3,1],
 'loss' : ['linear', 'square', 'exponential']
 }

In [23]:
start = time.time()
ada_grid_search = GridSearchCV(model,parameters,cv = 3)
ada_grid_search_fit = ada_grid_search.fit(X_train, y_train)
end = time.time()
print('Time:',end - start)

Time: 4493.107969999313


In [24]:
ada_grid_search_fit.best_params_

{'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 50}

In [25]:
ada_grid_search_fit.best_estimator_

AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='exponential',
         n_estimators=50, random_state=None)

In [26]:
print(ada_grid_search_fit.best_score_)

0.43120006350478934


## Prediction

In [106]:
start = time.time()
regessor = AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='exponential',
         n_estimators=50, random_state=None)
regessor_fit = regessor.fit(X_train, y_train)
end = time.time()
print('Time:',end - start)

Time: 65.61470174789429


In [69]:
result = regessor.predict(X_test)

In [107]:
predicted_confidence = pd.DataFrame({"predicted_confidence": result})
test_typo =  pd.DataFrame({"typo": np.array(test_words['Typo'])})
test_truth = pd.DataFrame({"truth": np.array(test_words['Truth'])})
test_candidate = pd.DataFrame({"candidate": np.array(test_words['Candidate'])})
label = pd.DataFrame({"label": np.array(y_test)})

In [87]:
unsorted_test_final_output = pd.concat([test_typo, test_truth, test_candidate, predicted_confidence, label], axis=1)
unsorted_test_final_output.head(20)

Unnamed: 0,typo,truth,candidate,predicted_confidence,label
0,cm,cma,a,0.000558,0
1,cm,cma,aa,0.000558,0
2,cm,cma,aach,0.000558,0
3,cm,cma,aad,0.000558,0
4,cm,cma,aai,0.000558,0
5,cm,cma,aam,0.000558,0
6,cm,cma,aar,0.000558,0
7,cm,cma,ab,0.000558,0
8,cm,cma,abc,0.000558,0
9,cm,cma,abcs,0.000558,0


## Measurement

In [141]:
#define files path
tess_dir = "../data/tesseract/"
ground_dir = "../data/ground_truth_trimmed/"
file_name = os.listdir(tess_dir)

In [142]:
#Avreage Number of Candidates 
feature_output.shape[0]/feature_output[feature_output.Label==1].shape[0]

373.4526343799894

In [143]:
#Top_n candidates
candidate_10 = unsorted_test_final_output.groupby("typo").apply(lambda x: x.nlargest(10,columns = 'predicted_confidence'))
candidate_5 = unsorted_test_final_output.groupby("typo").apply(lambda x: x.nlargest(5,columns = 'predicted_confidence'))
candidate_3 = unsorted_test_final_output.groupby("typo").apply(lambda x: x.nlargest(3,columns = 'predicted_confidence'))
candidate_1 = unsorted_test_final_output.groupby("typo").apply(lambda x: x.nlargest(1,columns = 'predicted_confidence'))

In [144]:
candidate_10.loc['acrsv']

Unnamed: 0,typo,truth,candidate,predicted_confidence,label
141602,acrsv,acrs,acrs,0.097398,1
141598,acrsv,acrs,acr,0.053191,0
141601,acrsv,acrs,across,0.053191,0
141599,acrsv,acrs,acra,0.000566,0
141600,acrsv,acrs,acre,0.000566,0
141603,acrsv,acrs,acs,0.000566,0
141604,acrsv,acrs,acsh,0.000561,0
141608,acrsv,acrs,activ,0.000561,0
141611,acrsv,acrs,acts,0.000561,0
141679,acrsv,acrs,cars,0.000561,0


In [151]:
#Top_n candidates wordwise precision
total_typo = sum(y_test==1)
top_10= "{0:.2%}".format(candidate_10[candidate_10.label==1].shape[0]/total_typo)
top_5= "{0:.2%}".format(candidate_5[candidate_5.label==1].shape[0]/total_typo)
top_3= "{0:.2%}".format(candidate_3[candidate_3.label==1].shape[0]/total_typo)
top_1= "{0:.2%}".format(candidate_1[candidate_1.label==1].shape[0]/total_typo)

top = pd.DataFrame({"top": np.array([1,3,5,10])})
precision = pd.DataFrame({"precision": np.array([top_1, top_3, top_5, top_10])})
pd.concat([top, precision], axis=1)

Unnamed: 0,top,precision
0,1,72.10%
1,3,85.77%
2,5,87.98%
3,10,90.61%
