<h1> TEST - Predictor typos in US - MULTI </h1>

* <b>Retrieves incrementally dataframes with n% of typos from <i>typos_generator</i></b>
* <b>In each case, executes one hot encoding of categorical variables</b>
* <b>Retrieves base MSE of data with n-typos</b>
* <b>Stores metrics for further statistics</b>

In [53]:
# imports
import copy
import pandas as pd 
import numpy as np
from sklearn import metrics  
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder
import random
import string
from collections import OrderedDict
import time

import import_ipynb
import typos_generator as typos_generator

%matplotlib inline

# Data Split

In [54]:
# Load the Wine Dataset
df_wine = pd.read_csv("df_wine_horizontal_tests.csv", encoding = 'utf8', index_col=0)
df_wine = df_wine.reset_index()
# check data
df_wine.head(4)

Unnamed: 0,index,country,province,region,price,variety,points,year_of_wine
0,0,US,Oregon,Willamette Valley,14.0,Pinot Gris,87,2013
1,1,US,Michigan,Lake Michigan Shore,13.0,Riesling,87,2013
2,2,US,Oregon,Willamette Valley,65.0,Pinot Noir,87,2012
3,3,Spain,Northern Spain,Navarra,15.0,Tempranillo-Merlot,87,2011


In [55]:
# drop unnecessary columns
# df_wine = df_wine.drop(['province', 'region', 'price', 'variety', 'year_of_wine'], axis=1)
# df_wine.head(4)

In [56]:
df_X = df_wine.drop('points', axis=1)
df_Y = df_wine[['points']]
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.3, random_state=0) 

In [57]:
X_test_copy = copy.deepcopy(X_test)
X_test_copy.head(4)


Unnamed: 0,index,country,province,region,price,variety,year_of_wine
96588,100226,France,Alsace,Alsace,48.0,Gewürztraminer,2014
29171,30266,Italy,Northeastern Italy,Collio,11.0,Friulano,2013
32367,33636,US,California,California Other,9.0,Merlot,2014
78874,81896,US,Oregon,Southern Oregon,24.0,Syrah,2009


In [58]:
# check number of US values in X_train
frequencies = X_test_copy["country"].value_counts()
frequencies.sort_values
print(frequencies)

US           15834
France        4907
Italy         4896
Spain         1832
Argentina     1101
Australia      655
Canada          86
Name: country, dtype: int64


# One hot encoding (constant)

In [59]:
X_train = pd.get_dummies(X_train, columns=['country'])
X_train = pd.get_dummies(X_train, columns=['province'])
X_train = pd.get_dummies(X_train, columns=['region'])
X_train = pd.get_dummies(X_train, columns=['variety'])

# Setting: number of experinents

In [60]:
n = [10, 20, 30, 40, 50]

# Results Typos in Data

In [61]:
i = 0
results = list()


typos = list()

for X_test_typos in typos_generator.generate_dirty_data(X_test_copy, "country", "US", n):
    # store data with typos for further cleaning
    typos.append(X_test_typos)
    
    print("[OK] Experiment {}: ".format(i+1))
    start_time = time.time()
    # get hyperparameters
    # max_depth, min_samples_split = get_hyperparameters(X_train, y_train)
    # print("max_depth: {}, min_samples_split: {}".format(max_depth, min_samples_split))
    
    # one hot encoding (variable)
    X_test_typos = pd.get_dummies(X_test_typos, columns=['country'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['province'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['region'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['variety'])
    
    X_test_typos, X_train = X_test_typos.align(X_train, join='outer', axis=1, fill_value=0)
    
    # print(X_test_typos.shape, X_test.shape, y_train.shape, y_test.shape)
    
    # apply regression
    regressor = DecisionTreeRegressor()  
    regressor.fit(X_train, y_train) # Train the model using the training sets
    y_pred = regressor.predict(X_test_typos) # Make predictions using the testing set

    # The evaluation metrics
    mae = round(metrics.mean_absolute_error(y_test, y_pred), 4)
    mse = round(metrics.mean_squared_error(y_test, y_pred), 4)
    rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 4)
    elapsed_time = round(time.time() - start_time, 2)
    
    print('Mean Absolute Error:', mae)  
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('Elapsed time: ', elapsed_time)
    print("*****"*20)
    
    # store results
    result = dict(
        n=i+1,
        columns=len(X_test_typos.columns),
        elapsed_time=elapsed_time,
        results=dict(mae=mae, mse=mse, rmse=rmse)
    )
    results.append(result)
    i += 1

****************************************************************************************************
[OK] Inserting typos in 1583 out of 15834 rows in US. (10%)
[OK] Experiment 1: 
Mean Absolute Error: 2.1889
Mean Squared Error: 8.7739
Root Mean Squared Error: 2.9621
Elapsed time:  15.76
****************************************************************************************************
[OK] Inserting typos in 3166 out of 15834 rows in US. (20%)
[OK] Experiment 2: 
Mean Absolute Error: 2.1862
Mean Squared Error: 8.751
Root Mean Squared Error: 2.9582
Elapsed time:  25.48
****************************************************************************************************
[OK] Inserting typos in 4750 out of 15834 rows in US. (30%)
[OK] Experiment 3: 
Mean Absolute Error: 2.1822
Mean Squared Error: 8.7085
Root Mean Squared Error: 2.951
Elapsed time:  41.99
****************************************************************************************************
[OK] Inserting typos in 6333 out o

In [62]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(results)

[   {   'columns': 2946,
        'elapsed_time': 15.76,
        'n': 1,
        'results': {'mae': 2.1889, 'mse': 8.7739, 'rmse': 2.9621}},
    {   'columns': 5349,
        'elapsed_time': 25.48,
        'n': 2,
        'results': {'mae': 2.1862, 'mse': 8.751, 'rmse': 2.9582}},
    {   'columns': 8482,
        'elapsed_time': 41.99,
        'n': 3,
        'results': {'mae': 2.1822, 'mse': 8.7085, 'rmse': 2.951}},
    {   'columns': 12025,
        'elapsed_time': 72.06,
        'n': 4,
        'results': {'mae': 2.1789, 'mse': 8.6735, 'rmse': 2.9451}},
    {   'columns': 15751,
        'elapsed_time': 102.72,
        'n': 5,
        'results': {'mae': 2.1852, 'mse': 8.7003, 'rmse': 2.9496}}]


<h1> Predictor cleaned typos in country </h1>

* <b>Iterates through dataframes with n% of typos (previously stored)</b>
* <b>In each case, detects and clean typos</b>
* <b>In each case, executes one hot encoding of categorical variables</b>
* <b>Retrieves base MSE of data with n-typos</b>
* <b>Stores metrics for further statistics</b>

# Typos cleaner

In [63]:
def clean_value(value, original, percent):
    typos_chars = list(value)
    items = list(original)
    
    check_list = dict()
    for item in items:
        check_list[item] = False
    
    # check if value contains all chars of original
    for typo in typos_chars:
        if typo in original:
            if check_list[typo]:
                continue
            check_list[typo] = True
    
    # how many characters from the original were found, 
    # correctnes 100 means, all characters in the original were found
    correctness = int((percent * len(original)) / 100.0)
    corrected = len([x for x in check_list.values() if x])

    if corrected < correctness:
    # print("[ERROR] Couldnt clean: {}.".format(value))
        return value

    return original

# Typos detection

In [64]:
def detect_typos(df, column_name, column_value, expected_distinct_values, correctness):
    i = 0
    b = 0
    print("[OK] Clean correctness: {}%".format(correctness))
    for index, row in df.iterrows():
        value = row[column_name]
        if value in expected_distinct_values:
            continue
        
        new_value = clean_value(value, column_value, correctness)
        
        if value != new_value:
            df.at[index, column_name] = new_value
            # print("[OK] Cleaning {} results in {}".format(value, new_value))
            i += 1
        else:
            # discard the row that couldnt be cleaned
            # df.at[index, column_name] = np.NaN
            b += 1
    
    # print("[OK] Typos detected: {}".format(b+i))
    # print("[OK] Cleaned: {} rows".format(i))
    print("[WARN] Couldn't clean {} rows".format(b)) 
    return df

# Result CLEANED typos

In [65]:
# data cleaning settings
expected_distinct_values = ['US', 'Spain', 'Italy', 'France', 'Argentina', 'Australia', 'Canada']
correctness = 100

i = 0
results = list()


for X_test_typos in typos:
    print("[OK] Experiment {}: ".format(i+1))
    start_time = time.time()
    
    # data detect and clean
    X_test_typos = detect_typos(X_test_typos, "country", "US", expected_distinct_values, correctness)
    
    # one hot encoding (variable)
    X_test_typos = pd.get_dummies(X_test_typos, columns=['country'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['province'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['region'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['variety'])
    
    X_test_typos, X_train = X_test_typos.align(X_train, join='outer', axis=1, fill_value=0)
    
    # print(X_test_typos.shape, X_test.shape, y_train.shape, y_test.shape)
    
    # apply regression
    regressor = DecisionTreeRegressor()  
    regressor.fit(X_train, y_train) # Train the model using the training sets
    y_pred = regressor.predict(X_test_typos) # Make predictions using the testing set

    # The evaluation metrics
    mae = round(metrics.mean_absolute_error(y_test, y_pred), 4)
    mse = round(metrics.mean_squared_error(y_test, y_pred), 4)
    rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 4)
    elapsed_time = round(time.time() - start_time, 2)
    
    print('Mean Absolute Error:', mae)  
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('Elapsed time: ', elapsed_time)
    print("*****"*20)
    
    # store results
    result = dict(
        n=i+1,
        columns=len(X_test_typos.columns),
        elapsed_time=elapsed_time,
        results=dict(mae=mae, mse=mse, rmse=rmse)
    )
    results.append(result)
    i += 1

[OK] Experiment 1: 
[OK] Clean correctness: 100%
[WARN] Couldn't clean 872 rows
Mean Absolute Error: 2.1884
Mean Squared Error: 8.7892
Root Mean Squared Error: 2.9647
Elapsed time:  77.59
****************************************************************************************************
[OK] Experiment 2: 
[OK] Clean correctness: 100%
[WARN] Couldn't clean 1764 rows
Mean Absolute Error: 2.1854
Mean Squared Error: 8.7729
Root Mean Squared Error: 2.9619
Elapsed time:  77.55
****************************************************************************************************
[OK] Experiment 3: 
[OK] Clean correctness: 100%
[WARN] Couldn't clean 2673 rows
Mean Absolute Error: 2.1843
Mean Squared Error: 8.7551
Root Mean Squared Error: 2.9589
Elapsed time:  75.54
****************************************************************************************************
[OK] Experiment 4: 
[OK] Clean correctness: 100%
[WARN] Couldn't clean 3458 rows
Mean Absolute Error: 2.1809
Mean Squared Error: 8.

In [66]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(results)

[   {   'columns': 15751,
        'elapsed_time': 77.59,
        'n': 1,
        'results': {'mae': 2.1884, 'mse': 8.7892, 'rmse': 2.9647}},
    {   'columns': 15751,
        'elapsed_time': 77.55,
        'n': 2,
        'results': {'mae': 2.1854, 'mse': 8.7729, 'rmse': 2.9619}},
    {   'columns': 15751,
        'elapsed_time': 75.54,
        'n': 3,
        'results': {'mae': 2.1843, 'mse': 8.7551, 'rmse': 2.9589}},
    {   'columns': 15751,
        'elapsed_time': 79.24,
        'n': 4,
        'results': {'mae': 2.1809, 'mse': 8.7371, 'rmse': 2.9559}},
    {   'columns': 15751,
        'elapsed_time': 76.89,
        'n': 5,
        'results': {'mae': 2.1833, 'mse': 8.7538, 'rmse': 2.9587}}]
