In [1]:
import os
import time
import warnings
from Corrector import *
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split

### Prepare the Data

In [3]:
# working directory of 'Spring2019-Proj4-grp9'
pwd = os.path.dirname(os.getcwd())
# set the working directory of processed data
Correction_wd = os.path.join(pwd, "output", "Correction")

In [7]:
if os.path.exists(os.path.join(Correction_wd, "dataset.pkl")):
    dataset = pd.read_pickle(os.path.join(Correction_wd, "dataset.pkl"))
else:
    tic = time.clock()
    # This is a Corrector object we defined for storing/processing data
    corrector = Corrector()
    # All the error and true words
    We = list(corrector.error_text['WORD_OCR'])
    Truth = list(corrector.error_text['WORD_TRUE'])
    
    # Initialize the dataset
    dataset = pd.DataFrame()
    
    # Compute those scores
    for i in range(len(We)):
        data_piece = pd.DataFrame()
        for Threshold in range(10):
            candidates = corrector.candidate_search(We[i], Threshold)
            if len(candidates) >= 10:
                break

        dist_score = distance_score(candidates, We[i], Threshold)
        simi_score = similarity_score(candidates, We[i])
        popu_score = popularity_score(candidates)
        exis_score = existance_score(candidates, corrector.lexicon)
        exat_score = exact_popularity_score(candidates, We[i], three_gram(corrector.error_text.iloc[i]),
                                            corrector.dictionary_exact)
        rela_score = relaxed_popularity_score(candidates, We[i], three_gram(corrector.error_text.iloc[i]),
                                              corrector.dictionary_relaxed)

        data_piece["We"] = [We[i]] * len(candidates)
        data_piece["Wc"] = candidates.keys()
        data_piece["x1"] = dist_score.values()
        data_piece["x2"] = simi_score.values()
        data_piece["x3"] = popu_score.values()
        data_piece["x4"] = exis_score.values()
        data_piece["x5"] = exat_score.values()
        data_piece["x6"] = rela_score.values()

        label_list = []
        for j in range(len(candidates)):
            if data_piece.iloc[j, 1] == Truth[i]:
                label_list.append(1)
            else:
                label_list.append(0)
        data_piece["label"] = label_list
        dataset = dataset.append(data_piece, ignore_index=True)
    
    dataset.to_pickle(os.path.join(Correction_wd, "dataset.pkl"))
    toc = time.clock()
    print(toc-tic)

### Train the Regressor

#### Split the training and testing data

In [22]:
We = list(set(dataset.We))
We_train, We_test = train_test_split(We, test_size = 0.25)
train_index = [We in We_train for We in dataset.We]
train_data = dataset[train_index]
test_index = [not boolean for boolean in train_index]
test_data = dataset[test_index]

In [23]:
train_X = train_data.iloc[:,2:8]
train_y = train_data.label
test_X = test_data.iloc[:,2:8]
test_y = test_data.label

#### Fit the regressor

In [24]:
regressor = AdaBoostRegressor()
regressor.fit(train_X, train_y)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

In [25]:
predicttion_raw = regressor.predict(test_X).tolist()
test_data['prediction'] = predicttion_raw

In [26]:
result = test_data.groupby('We', as_index=False).max()

In [27]:
sum(result.label)/len(result.label)

0.884068278805121

In [33]:
result[25:30]

Unnamed: 0,We,Wc,x1,x2,x3,x4,x5,x6,label,prediction
25,advlslng,revising,0.6,0.65625,1.0,1,1.0,1.0,1,0.715614
26,affalr,unfair,0.75,0.833333,1.0,1,0.0,1.0,1,0.878468
27,afnlm,safely,0.5,0.611111,1.0,1,1.0,1.0,1,0.594569
28,agam,team,0.666667,0.857143,1.0,1,1.0,1.0,1,0.592806
29,agrlcultural,unnatural,0.875,0.8125,1.0,1,0.0,1.0,1,0.878468
