<h1> Predictor typos in US - SINGLE </h1>

* <b>Retrieves incrementally dataframes with n% of typos from <i>typos_generator</i></b>
* <b>In each case, executes one hot encoding of categorical variables</b>
* <b>Retrieves base MSE of data with n-typos</b>
* <b>Stores metrics for further statistics</b>

In [10]:
# imports
import copy
import pandas as pd 
import numpy as np
from sklearn import metrics  
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder
import random
import string
from collections import OrderedDict
import time

import import_ipynb
import typos_generator as typos_generator

%matplotlib inline

# Data Split

In [11]:
# Load the Wine Dataset
df_wine = pd.read_csv("df_wine_horizontal_tests.csv", encoding = 'utf8', index_col=0)
df_wine = df_wine.reset_index()
# check data
df_wine.head(4)

Unnamed: 0,index,country,province,region,price,variety,points,year_of_wine
0,0,US,Oregon,Willamette Valley,14.0,Pinot Gris,87,2013
1,1,US,Michigan,Lake Michigan Shore,13.0,Riesling,87,2013
2,2,US,Oregon,Willamette Valley,65.0,Pinot Noir,87,2012
3,3,Spain,Northern Spain,Navarra,15.0,Tempranillo-Merlot,87,2011


In [12]:
# drop unnecessary columns
df_wine = df_wine.drop(['province', 'region', 'price', 'variety', 'year_of_wine'], axis=1)
df_wine.head(4)

Unnamed: 0,index,country,points
0,0,US,87
1,1,US,87
2,2,US,87
3,3,Spain,87


In [13]:
df_X = df_wine.drop('points', axis=1)
df_Y = df_wine[['points']]
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.3, random_state=0) 

In [14]:
X_train_copy = copy.deepcopy(X_train)
X_train_copy.head(4)

Unnamed: 0,index,country
82970,86163,US
4606,4766,US
27058,28094,France
84009,87245,France


In [15]:
# check number of US values in X_train
frequencies = X_train_copy["country"].value_counts()
frequencies.sort_values
print(frequencies)

US           37282
France       11412
Italy        11173
Spain         4227
Argentina     2563
Australia     1568
Canada         167
Name: country, dtype: int64


# One hot encoding (constant)

In [16]:
X_test = pd.get_dummies(X_test, columns=['country'])

# Setting: number of experinents

In [17]:
n = [10, 20, 30, 40, 50]

# Results Typos in Data

In [18]:
i = 0
results = list()


typos = list()

for X_train_typos in typos_generator.generate_dirty_data(X_train_copy, "country", "US", n):
    # store data with typos for further cleaning
    typos.append(X_train_typos)
    
    print("[OK] Experiment {}: ".format(i+1))
    start_time = time.time()
    # get hyperparameters
    # max_depth, min_samples_split = get_hyperparameters(X_train, y_train)
    # print("max_depth: {}, min_samples_split: {}".format(max_depth, min_samples_split))
    
    # one hot encoding (variable)
    X_train_typos = pd.get_dummies(X_train_typos, columns=['country'])
    
    X_train_typos, X_test = X_train_typos.align(X_test, join='outer', axis=1, fill_value=0)
    
    # print(X_train_typos.shape, X_test.shape, y_train.shape, y_test.shape)
    
    # apply regression
    regressor = DecisionTreeRegressor()  
    regressor.fit(X_train_typos, y_train) # Train the model using the training sets
    y_pred = regressor.predict(X_test) # Make predictions using the testing set

    # The evaluation metrics
    mae = round(metrics.mean_absolute_error(y_test, y_pred), 4)
    mse = round(metrics.mean_squared_error(y_test, y_pred), 4)
    rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 4)
    elapsed_time = round(time.time() - start_time, 2)
    
    print('Mean Absolute Error:', mae)  
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('Elapsed time: ', elapsed_time)
    print("*****"*20)
    
    # store results
    result = dict(
        n=i+1,
        columns=len(X_train_typos.columns),
        elapsed_time=elapsed_time,
        results=dict(mae=mae, mse=mse, rmse=rmse)
    )
    results.append(result)
    i += 1

****************************************************************************************************
[OK] Inserting typos in 3728 out of 37282 rows in US. (10%)
[OK] Experiment 1: 
Mean Absolute Error: 0.3245
Mean Squared Error: 1.8552
Root Mean Squared Error: 1.3621
Elapsed time:  21.14
****************************************************************************************************
[OK] Inserting typos in 7456 out of 37282 rows in US. (20%)
[OK] Experiment 2: 
Mean Absolute Error: 0.3259
Mean Squared Error: 1.8677
Root Mean Squared Error: 1.3666
Elapsed time:  50.38
****************************************************************************************************
[OK] Inserting typos in 11184 out of 37282 rows in US. (30%)
[OK] Experiment 3: 
Mean Absolute Error: 0.3272
Mean Squared Error: 1.8806
Root Mean Squared Error: 1.3713
Elapsed time:  90.24
****************************************************************************************************
[OK] Inserting typos in 14912 o

In [19]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(results)

[   {   'columns': 3016,
        'elapsed_time': 21.14,
        'n': 1,
        'results': {'mae': 0.3245, 'mse': 1.8552, 'rmse': 1.3621}},
    {   'columns': 7925,
        'elapsed_time': 50.38,
        'n': 2,
        'results': {'mae': 0.3259, 'mse': 1.8677, 'rmse': 1.3666}},
    {   'columns': 13668,
        'elapsed_time': 90.24,
        'n': 3,
        'results': {'mae': 0.3272, 'mse': 1.8806, 'rmse': 1.3713}},
    {   'columns': 19525,
        'elapsed_time': 142.28,
        'n': 4,
        'results': {'mae': 0.3282, 'mse': 1.8856, 'rmse': 1.3732}},
    {   'columns': 25310,
        'elapsed_time': 204.86,
        'n': 5,
        'results': {'mae': 0.3322, 'mse': 1.9309, 'rmse': 1.3896}}]


<h1> Predictor cleaned typos in country </h1>

* <b>Iterates through dataframes with n% of typos (previously stored)</b>
* <b>In each case, detects and clean typos</b>
* <b>In each case, executes one hot encoding of categorical variables</b>
* <b>Retrieves base MSE of data with n-typos</b>
* <b>Stores metrics for further statistics</b>

# Typos cleaner

In [20]:
def clean_value(value, original, percent):
    typos_chars = list(value)
    items = list(original)
    
    check_list = dict()
    for item in items:
        check_list[item] = False
    
    # check if value contains all chars of original
    for typo in typos_chars:
        if typo in original:
            if check_list[typo]:
                continue
            check_list[typo] = True
    
    # how many characters from the original were found, 
    # correctnes 100 means, all characters in the original were found
    correctness = int((percent * len(original)) / 100.0)
    corrected = len([x for x in check_list.values() if x])

    if corrected < correctness:
    # print("[ERROR] Couldnt clean: {}.".format(value))
        return value

    return original

# Typos detection

In [21]:
def detect_typos(df, column_name, column_value, expected_distinct_values, correctness):
    i = 0
    b = 0
    print("[OK] Clean correctness: {}%".format(correctness))
    for index, row in df.iterrows():
        value = row[column_name]
        if value in expected_distinct_values:
            continue
        
        new_value = clean_value(value, column_value, correctness)
        
        if value != new_value:
            df.at[index, column_name] = new_value
            # print("[OK] Cleaning {} results in {}".format(value, new_value))
            i += 1
        else:
            # discard the row that couldnt be cleaned
            # df.at[index, column_name] = np.NaN
            b += 1
    
    # print("[OK] Typos detected: {}".format(b+i))
    # print("[OK] Cleaned: {} rows".format(i))
    print("[WARN] Couldn't clean {} rows".format(b)) 
    return df

# Result CLEANED typos

In [22]:
# data cleaning settings
expected_distinct_values = ['US', 'Spain', 'Italy', 'France', 'Argentina', 'Australia', 'Canada']
correctness = 100

i = 0
results = list()


for X_train_typos in typos:
    print("[OK] Experiment {}: ".format(i+1))
    start_time = time.time()
    
    # data detect and clean
    X_train_typos = detect_typos(X_train_typos, "country", "US", expected_distinct_values, correctness)
    
    # one hot encoding (variable)
    X_train_typos = pd.get_dummies(X_train_typos, columns=['country'])
    
    X_train_typos, X_test = X_train_typos.align(X_test, join='outer', axis=1, fill_value=0)
    
    # print(X_train_typos.shape, X_test.shape, y_train.shape, y_test.shape)
    
    # apply regression
    regressor = DecisionTreeRegressor()  
    regressor.fit(X_train_typos, y_train) # Train the model using the training sets
    y_pred = regressor.predict(X_test) # Make predictions using the testing set

    # The evaluation metrics
    mae = round(metrics.mean_absolute_error(y_test, y_pred), 4)
    mse = round(metrics.mean_squared_error(y_test, y_pred), 4)
    rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 4)
    elapsed_time = round(time.time() - start_time, 2)
    
    print('Mean Absolute Error:', mae)  
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('Elapsed time: ', elapsed_time)
    print("*****"*20)
    
    # store results
    result = dict(
        n=i+1,
        columns=len(X_train_typos.columns),
        elapsed_time=elapsed_time,
        results=dict(mae=mae, mse=mse, rmse=rmse)
    )
    results.append(result)
    i += 1

[OK] Experiment 1: 
[OK] Clean correctness: 100%
[WARN] Couldn't clean 2123 rows
Mean Absolute Error: 0.3247
Mean Squared Error: 1.8665
Root Mean Squared Error: 1.3662
Elapsed time:  161.53
****************************************************************************************************
[OK] Experiment 2: 
[OK] Clean correctness: 100%
[WARN] Couldn't clean 4143 rows
Mean Absolute Error: 0.3253
Mean Squared Error: 1.8694
Root Mean Squared Error: 1.3672
Elapsed time:  165.75
****************************************************************************************************
[OK] Experiment 3: 
[OK] Clean correctness: 100%
[WARN] Couldn't clean 6184 rows
Mean Absolute Error: 0.3266
Mean Squared Error: 1.8798
Root Mean Squared Error: 1.3711
Elapsed time:  169.84
****************************************************************************************************
[OK] Experiment 4: 
[OK] Clean correctness: 100%
[WARN] Couldn't clean 8221 rows
Mean Absolute Error: 0.3264
Mean Squared Error

In [23]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(results)

[   {   'columns': 25310,
        'elapsed_time': 161.53,
        'n': 1,
        'results': {'mae': 0.3247, 'mse': 1.8665, 'rmse': 1.3662}},
    {   'columns': 25310,
        'elapsed_time': 165.75,
        'n': 2,
        'results': {'mae': 0.3253, 'mse': 1.8694, 'rmse': 1.3672}},
    {   'columns': 25310,
        'elapsed_time': 169.84,
        'n': 3,
        'results': {'mae': 0.3266, 'mse': 1.8798, 'rmse': 1.3711}},
    {   'columns': 25310,
        'elapsed_time': 175.63,
        'n': 4,
        'results': {'mae': 0.3264, 'mse': 1.8783, 'rmse': 1.3705}},
    {   'columns': 25310,
        'elapsed_time': 180.32,
        'n': 5,
        'results': {'mae': 0.3281, 'mse': 1.9032, 'rmse': 1.3796}}]
