<h1> Typos Generator </h1>

* <b>Reads ground truth db (<i>df_wine_horizontal_tests.csv</i>)</b>
* <b> Checks most influential country based on frequency</b>
* <b>Incrementally generates typos from 1 to 100% of data</b>
* <b>In each case, stores a new csv file with n typos</b>

In [1]:
# imports
import copy
import time
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import linear_model
import pandas as pd 
import numpy as np
from sklearn import metrics  
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder
import random
import string

%matplotlib inline

# Data loader

In [2]:
# Load the Wine Dataset
df_wine = pd.read_csv("df_wine_horizontal_tests.csv", encoding = 'utf8', index_col=0)
df_wine = df_wine.reset_index()
# check data
df_wine.head(4)

Unnamed: 0,index,country,province,region,price,variety,points,year_of_wine
0,0,US,Oregon,Willamette Valley,14.0,Pinot Gris,87,2013
1,1,US,Michigan,Lake Michigan Shore,13.0,Riesling,87,2013
2,2,US,Oregon,Willamette Valley,65.0,Pinot Noir,87,2012
3,3,Spain,Northern Spain,Navarra,15.0,Tempranillo-Merlot,87,2011


# Influence checker

In [3]:
# check distinct countries
countries = pd.unique(df_wine['country'])
print(countries)

['US' 'Spain' 'Italy' 'France' 'Argentina' 'Australia' 'Canada']


In [4]:
# todo: check frequencies of country
frequencies = df_wine['country'].value_counts()
frequencies.sort_values
print(frequencies)

US           53116
France       16319
Italy        16069
Spain         6059
Argentina     3664
Australia     2223
Canada         253
Name: country, dtype: int64


In [5]:
name = frequencies.index[0]
value = frequencies[0]
print("Most frequent: {name} ({value})".format(name=name, value=value))

Most frequent: US (53116)


# Typos generator

In [6]:
def randomize(value, n):
    # makes harder to match the original value replacing n of the original chars
    letters = string.ascii_letters
    indexes = range(0, len(value))
    indexes = random.sample(indexes, n)
    for index in indexes:
        temp = list(value)
        temp[index] = random.choice(letters)
        value = ''.join(temp)
    return value

In [7]:
def generate_random_typo(value):
    letters = string.ascii_letters
    degree = random.randint(1, len(value)) # add at least 1 typo
    indexes = range(0, len(value))
    indexes = random.sample(indexes, degree)

    for index in indexes:
        temp = list(value)
        temp[index] = temp[index] + random.choice(letters)
        value = ''.join(temp)

    return randomize(value, 1)
        

In [8]:
def insert_typos(df, column_name, column_value, n, percent, size):
    print("[OK] Inserting typos in {} out of {} rows in {}. ({}%)".format(n, size, column_value, percent))
    
    ocurrence_indexes = df.index[df[column_name] == column_value].tolist()
    
    # select random elements 
    selected_indexes = random.sample(ocurrence_indexes, n)
    for index, row in df.iterrows():
        if index in selected_indexes:
            value = generate_random_typo(column_value)
            # print("[OK] Generated typo...", value)
            df.at[index, column_name] = value
    return df

In [9]:
# attempt to incrementally include typos in all US occurrences
# stores (n, time) tuples
metrics = list() 
n = 100 # percentage
temp = copy.deepcopy(df_wine)
for i in range(1, n+1):
    print("*****"*20)
    start_time = time.time()
    # Generates n typos in the selected country name
    aux = copy.deepcopy(temp)
    # increment nr of typos by 1%
    rows_affected = int((i * value) / 100.0)
    df_wine_typos = insert_typos(aux, "country", "US", rows_affected, i, value)
    # save typos in a csv to clean them after
    df_wine_typos.to_csv('df_wine_country_typos_{}.csv'.format(i))
    # check results
    # frequencies = df_wine_typos['country'].value_counts()
    # frequencies.sort_values
    elapsed_time = time.time() - start_time
    # print(frequencies)
    print("Elapsed time: {} sec".format(int(elapsed_time)))
    print("*****"*20)
    metrics.append((i, int(elapsed_time)))
    
print("[OK] Finished")

****************************************************************************************************
[OK] Inserting typos in 531 out of 53116 rows in US. (1%)
Elapsed time: 6 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 1062 out of 53116 rows in US. (2%)
Elapsed time: 7 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 1593 out of 53116 rows in US. (3%)
Elapsed time: 8 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 2124 out of 53116 rows in US. (4%

Elapsed time: 31 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 16465 out of 53116 rows in US. (31%)
Elapsed time: 32 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 16997 out of 53116 rows in US. (32%)
Elapsed time: 32 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 17528 out of 53116 rows in US. (33%)
Elapsed time: 33 sec
****************************************************************************************************
**************************

Elapsed time: 60 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 31869 out of 53116 rows in US. (60%)
Elapsed time: 65 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 32400 out of 53116 rows in US. (61%)
Elapsed time: 56 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 32931 out of 53116 rows in US. (62%)
Elapsed time: 54 sec
****************************************************************************************************
**************************

Elapsed time: 84 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 47273 out of 53116 rows in US. (89%)
Elapsed time: 87 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 47804 out of 53116 rows in US. (90%)
Elapsed time: 170 sec
****************************************************************************************************
****************************************************************************************************
[OK] Inserting typos in 48335 out of 53116 rows in US. (91%)
Elapsed time: 93 sec
****************************************************************************************************
*************************

In [10]:
print(metrics)

[(1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), (7, 12), (8, 12), (9, 13), (10, 14), (11, 15), (12, 16), (13, 17), (14, 18), (15, 19), (16, 20), (17, 20), (18, 21), (19, 22), (20, 23), (21, 24), (22, 25), (23, 25), (24, 26), (25, 27), (26, 28), (27, 29), (28, 29), (29, 31), (30, 31), (31, 32), (32, 32), (33, 33), (34, 34), (35, 37), (36, 39), (37, 42), (38, 40), (39, 38), (40, 39), (41, 40), (42, 40), (43, 41), (44, 42), (45, 43), (46, 46), (47, 49), (48, 45), (49, 50), (50, 50), (51, 52), (52, 50), (53, 55), (54, 57), (55, 56), (56, 54), (57, 61), (58, 61), (59, 60), (60, 65), (61, 56), (62, 54), (63, 55), (64, 55), (65, 56), (66, 63), (67, 64), (68, 65), (69, 68), (70, 67), (71, 68), (72, 65), (73, 65), (74, 67), (75, 73), (76, 68), (77, 72), (78, 71), (79, 70), (80, 75), (81, 73), (82, 71), (83, 73), (84, 252), (85, 77), (86, 316), (87, 89), (88, 84), (89, 87), (90, 170), (91, 93), (92, 74), (93, 77), (94, 82), (95, 79), (96, 82), (97, 93), (98, 91), (99, 80), (100, 82)]
