<h1> TEST - Predictor typos in Argentina - MULTI </h1>

* <b>Retrieves incrementally dataframes with n% of typos from <i>typos_generator</i></b>
* <b>In each case, executes one hot encoding of categorical variables</b>
* <b>Retrieves base MSE of data with n-typos</b>
* <b>Stores metrics for further statistics</b>

In [None]:
# imports
import copy
import pprint
import random
import string
import time
from collections import Counter
from collections import OrderedDict

import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

# import_ipynb has to be imported before any notebook module such as typos_generator
import import_ipynb
import typos_generator as typos_generator

%matplotlib inline

# Data Split

In [None]:
# Read the wine dataset: the first CSV column is parsed as the index and then
# turned back into an ordinary column by reset_index().
df_wine = pd.read_csv(
    "df_wine_horizontal_tests.csv",
    encoding='utf8',
    index_col=0,
).reset_index()
# Preview the first rows
df_wine.head(4)

In [None]:
# drop unnecessary columns
# df_wine = df_wine.drop(['province', 'region', 'price', 'variety', 'year_of_wine'], axis=1)
# df_wine.head(4)

In [None]:
# Target is the 'points' column (kept as a one-column DataFrame);
# every other column is a feature.
df_Y = df_wine[['points']]
df_X = df_wine.drop(columns='points')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_Y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Work on an independent deep copy of the held-out features so X_test itself
# is never modified.
X_test_copy = X_test.copy(deep=True)
X_test_copy.head(4)


In [None]:
# Count how often each country appears in the held-out test features.
# value_counts() already returns the counts sorted in descending order,
# so no extra sorting step is needed.
frequencies = X_test_copy["country"].value_counts()
print(frequencies)

# One hot encoding (constant)

In [None]:
# One-hot encode all categorical training features in one pass. The dummy
# columns are appended after the remaining columns in the order listed here.
X_train = pd.get_dummies(
    X_train,
    columns=['country', 'province', 'region', 'variety'],
)

# Setting: number of experiments

In [None]:
# Experiment settings handed to typos_generator.generate_dirty_data
# (presumably the percentage of typos per experiment - TODO confirm).
n = list(range(10, 51, 10))

# Results Typos in Data

In [None]:
# Baseline run: evaluate the regressor on every test set that still contains typos.
results = list()

# Frames with typos, kept so they can be cleaned and re-evaluated later
typos = list()

dirty_frames = typos_generator.generate_dirty_data(X_test_copy, "country", "Argentina", n)
for experiment, X_test_typos in enumerate(dirty_frames, start=1):
    # store data with typos for further cleaning
    typos.append(X_test_typos)

    print("[OK] Experiment {}: ".format(experiment))
    start_time = time.time()

    # one hot encoding (variable): the set of dummy columns depends on the typos
    X_test_encoded = pd.get_dummies(
        X_test_typos,
        columns=['country', 'province', 'region', 'variety'],
    )

    # Align into loop-local frames. Rebinding the global X_train here would
    # carry the typo columns of earlier experiments into later ones (and into
    # later cells), inflating the column count and making the cell
    # non-repeatable.
    X_test_aligned, X_train_aligned = X_test_encoded.align(
        X_train, join='outer', axis=1, fill_value=0
    )

    # apply regression; the seed makes the fitted tree reproducible between runs
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train_aligned, y_train)  # Train the model using the training sets
    y_pred = regressor.predict(X_test_aligned)  # Make predictions using the testing set

    # The evaluation metrics (RMSE is derived from the unrounded MSE)
    mae = round(metrics.mean_absolute_error(y_test, y_pred), 4)
    raw_mse = metrics.mean_squared_error(y_test, y_pred)
    mse = round(raw_mse, 4)
    rmse = round(np.sqrt(raw_mse), 4)
    elapsed_time = round(time.time() - start_time, 2)

    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('Elapsed time: ', elapsed_time)
    print("*****"*20)

    # store results
    results.append(dict(
        n=experiment,
        columns=len(X_test_aligned.columns),
        elapsed_time=elapsed_time,
        results=dict(mae=mae, mse=mse, rmse=rmse)
    ))

In [None]:
# Pretty-print the metrics collected for every experiment
# (pprint is imported with the other modules at the top of the notebook).
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(results)

<h1> Predictor cleaned typos in country </h1>

* <b>Iterates through dataframes with n% of typos (previously stored)</b>
* <b>In each case, detects and cleans typos</b>
* <b>In each case, executes one hot encoding of categorical variables</b>
* <b>Retrieves base MSE of data with n-typos</b>
* <b>Stores metrics for further statistics</b>

# Typos cleaner

In [None]:
def clean_value(value, original, percent):
    """Replace ``value`` with ``original`` when it looks like a typo of it.

    The characters of ``value`` are compared with those of ``original``
    case-sensitively and ignoring their order. Every occurrence of a character
    in ``original`` can be matched by at most one occurrence in ``value``, so
    repeated letters (e.g. the two ``n`` in "Argentina") count individually.

    Parameters
    ----------
    value : str
        The observed, possibly misspelled, value.
    original : str
        The correct spelling ``value`` is compared against.
    percent : int or float
        Minimum share (0-100) of the characters of ``original`` that must be
        found in ``value``; 100 means every character must be present.

    Returns
    -------
    ``original`` when enough characters match, otherwise ``value`` unchanged.
    Values that are not strings (e.g. NaN or None) are returned unchanged.
    """
    # Missing or non-text cells cannot be typos of a string; leave them alone.
    if not isinstance(value, str):
        return value

    # Multiset intersection: for each character, the smaller of its counts in
    # the two strings. Counting with multiplicity keeps the number of matches
    # on the same scale as len(original), so percent=100 is reachable even
    # when the original contains repeated letters.
    matched = sum((Counter(value) & Counter(original)).values())

    # Minimum number of matching characters required for a correction
    required = int((percent * len(original)) / 100.0)

    if matched < required:
        # Too different from the original: keep the value as it is
        return value

    return original

# Typos detection

In [None]:
def detect_typos(df, column_name, column_value, expected_distinct_values, correctness):
    """Find unexpected values in one column and try to repair them.

    Every row whose ``column_name`` value is not listed in
    ``expected_distinct_values`` is passed to ``clean_value`` together with
    ``column_value`` and ``correctness``. When ``clean_value`` returns a
    different value, it is written back into ``df`` in place; otherwise the row
    is counted as not cleanable. The correctness setting and the number of
    rows that could not be cleaned are printed.

    Returns the same ``df`` object that was passed in.
    """
    repaired_rows = 0  # tracked for debugging only, not reported
    unrepaired_rows = 0
    print("[OK] Clean correctness: {}%".format(correctness))

    for row_label, record in df.iterrows():
        current = record[column_name]
        if current not in expected_distinct_values:
            candidate = clean_value(current, column_value, correctness)
            if current != candidate:
                # clean_value proposed a correction: store it in the frame
                df.at[row_label, column_name] = candidate
                repaired_rows += 1
            else:
                # the row is kept as it is when it could not be cleaned
                unrepaired_rows += 1

    print("[WARN] Couldn't clean {} rows".format(unrepaired_rows))
    return df

# Result CLEANED typos

In [None]:
# data cleaning settings
expected_distinct_values = ['US', 'Spain', 'Italy', 'France', 'Argentina', 'Australia', 'Canada']
correctness = 100

i = 0
results = list()


for X_test_typos in typos:
    print("[OK] Experiment {}: ".format(i+1))
    start_time = time.time()
    
    # data detect and clean
    X_test_typos = detect_typos(X_test_typos, "country", "Argentina", expected_distinct_values, correctness)
    
    # one hot encoding (variable)
    X_test_typos = pd.get_dummies(X_test_typos, columns=['country'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['province'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['region'])
    X_test_typos = pd.get_dummies(X_test_typos, columns=['variety'])
    
    X_test_typos, X_train = X_test_typos.align(X_train, join='outer', axis=1, fill_value=0)
    
    # print(X_test_typos.shape, X_test.shape, y_train.shape, y_test.shape)
    
    # apply regression
    regressor = DecisionTreeRegressor()  
    regressor.fit(X_train, y_train) # Train the model using the training sets
    y_pred = regressor.predict(X_test_typos) # Make predictions using the testing set

    # The evaluation metrics
    mae = round(metrics.mean_absolute_error(y_test, y_pred), 4)
    mse = round(metrics.mean_squared_error(y_test, y_pred), 4)
    rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 4)
    elapsed_time = round(time.time() - start_time, 2)
    
    print('Mean Absolute Error:', mae)  
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('Elapsed time: ', elapsed_time)
    print("*****"*20)
    
    # store results
    result = dict(
        n=i+1,
        columns=len(X_test_typos.columns),
        elapsed_time=elapsed_time,
        results=dict(mae=mae, mse=mse, rmse=rmse)
    )
    results.append(result)
    i += 1

In [None]:
# Pretty-print the metrics collected for every cleaned-data experiment
# (pprint is imported with the other modules at the top of the notebook).
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(results)