<h1> Typos Generator </h1>

* <b>Reads ground truth db (<i>df_wine_horizontal_tests.csv</i>)</b>
* <b> Checks most influential country based on frequency</b>
* <b>Incrementally generates typos from 1 to 100% of data</b>
* <b>In each case, stores a new csv file with n typos</b>

In [206]:
# imports
import copy
import time
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import linear_model
import pandas as pd 
import numpy as np
from sklearn import metrics  
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder
import random
import string

%matplotlib inline

# Data loader

In [207]:
# Load the Wine Dataset
df_wine = pd.read_csv("df_wine_horizontal_tests.csv", encoding = 'utf8', index_col=0)
df_wine = df_wine.reset_index()
# check data
df_wine.head(4)

Unnamed: 0,index,country,province,region,price,variety,points,year_of_wine
0,0,US,Oregon,Willamette Valley,14.0,Pinot Gris,87,2013
1,1,US,Michigan,Lake Michigan Shore,13.0,Riesling,87,2013
2,2,US,Oregon,Willamette Valley,65.0,Pinot Noir,87,2012
3,3,Spain,Northern Spain,Navarra,15.0,Tempranillo-Merlot,87,2011


# Influence checker

In [208]:
# check distinct countries
countries = pd.unique(df_wine['country'])
print(countries)

['US' 'Spain' 'Italy' 'France' 'Argentina' 'Australia' 'Canada']


In [209]:
# todo: check frequencies of country
frequencies = df_wine['country'].value_counts()
frequencies.sort_values
print(frequencies)

US           53116
France       16319
Italy        16069
Spain         6059
Argentina     3664
Australia     2223
Canada         253
Name: country, dtype: int64


In [210]:
name = frequencies.index[0]
value = frequencies[0]
print("Most frequent: {name} ({value})".format(name=name, value=value))

Most frequent: US (53116)


# Typos generator

In [211]:
def randomize(value, n):
    # makes harder to match the original value replacing n of the original chars
    letters = string.ascii_letters
    indexes = range(0, len(value))
    indexes = random.sample(indexes, n)
    for index in indexes:
        temp = list(value)
        temp[index] = random.choice(letters)
        value = ''.join(temp)
    return value

In [212]:
def generate_random_typo(value):
    letters = string.ascii_letters
    degree = random.randint(1, len(value)) # add at least 1 typo
    indexes = range(0, len(value))
    indexes = random.sample(indexes, degree)

    for index in indexes:
        temp = list(value)
        temp[index] = temp[index] + random.choice(letters)
        value = ''.join(temp)

    return randomize(value, 1)
        

In [213]:
def insert_typos(df, column_name, column_value, n, percent, size):
    print("[OK] Inserting typos in {} out of {} rows in {}. ({}%)".format(n, size, column_value, percent))
    
    ocurrence_indexes = df.index[df[column_name] == column_value].tolist()
    
    # select random elements 
    selected_indexes = random.sample(ocurrence_indexes, n)
    for index, row in df.iterrows():
        if index in selected_indexes:
            value = generate_random_typo(column_value)
            # print("[OK] Generated typo...", value)
            df.at[index, column_name] = value
    return df

In [214]:
# attempt to incrementally include typos in all US occurrences
# stores (n, time) tuples
metrics = list() 
n = 100 # percentage
temp = copy.deepcopy(df_wine)
for i in range(1, n+1):
    print("*****"*20)
    start_time = time.time()
    # Generates n typos in the selected country name
    aux = copy.deepcopy(temp)
    # increment nr of typos by 1%
    rows_affected = int((i * value) / 100.0)
    df_wine_typos = insert_typos(aux, "country", "US", rows_affected, i, value)
    # save typos in a csv to clean them after
    df_wine_typos.to_csv('df_wine_country_typos_{}.csv'.format(i))
    # check results
    # frequencies = df_wine_typos['country'].value_counts()
    # frequencies.sort_values
    elapsed_time = time.time() - start_time
    # print(frequencies)
    print("Elapsed time: {} sec".format(int(elapsed_time)))
    print("*****"*20)
    metrics.append((i, int(elapsed_time)))
    
print("[OK] Finished")

****************************************************************************************************
[OK] Inserting typos in 531 out of 53116 rows in US. (1%)
Elapsed time: 21 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 1062 out of 53116 rows in US. (2%)
Elapsed time: 33 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 1593 out of 53116 rows in US. (3%)
Elapsed time: 41 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 2124 out of 53116 rows in US. 

Elapsed time: 156 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 16465 out of 53116 rows in US. (31%)
Elapsed time: 149 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 16997 out of 53116 rows in US. (32%)
Elapsed time: 159 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 17528 out of 53116 rows in US. (33%)
Elapsed time: 147 sec
****************************************************************************************************
**********************

KeyboardInterrupt: 

In [None]:
print(metrics)