<h1> Predictor typos in most influential country </h1>

* <b>Incrementally retrieves dataframes containing n% typos from <i>typos_generator</i></b>
* <b>For each dataframe, one-hot encodes the categorical variables</b>
* <b>Computes the base MSE (along with MAE and RMSE) of a model trained on the data with n% typos</b>
* <b>Stores the metrics for further statistics</b>

In [24]:
# imports
import copy
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import linear_model
import pandas as pd 
import numpy as np
from sklearn import metrics  
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder
import random
import string
from collections import OrderedDict
import time


import import_ipynb
import typos_generator as typos_generator

%matplotlib inline

# Data Split

In [25]:
# Read the wine dataset using the first CSV column as the index, then turn
# that index back into a regular column so rows get a fresh 0..N-1 index.
df_wine = pd.read_csv(
    "df_wine_horizontal_tests.csv",
    encoding="utf8",
    index_col=0,
).reset_index()

# Preview the first rows
df_wine.head(4)

Unnamed: 0,index,country,province,region,price,variety,points,year_of_wine
0,0,US,Oregon,Willamette Valley,14.0,Pinot Gris,87,2013
1,1,US,Michigan,Lake Michigan Shore,13.0,Riesling,87,2013
2,2,US,Oregon,Willamette Valley,65.0,Pinot Noir,87,2012
3,3,Spain,Northern Spain,Navarra,15.0,Tempranillo-Merlot,87,2011


In [26]:
# drop unnecessary columns
# df_wine = df_wine.drop(['province', 'region', 'price', 'variety', 'year_of_wine'], axis=1)
# df_wine.head(4)

In [27]:
# Target is the 'points' column (kept as a one-column DataFrame);
# every other column is used as a feature.
df_Y = df_wine[['points']]
df_X = df_wine.drop(columns=['points'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_Y,
    test_size=0.3,
    random_state=0,
)

In [28]:
# Deep copy of the training features (data and index are duplicated),
# so X_train_copy is independent of X_train.
X_train_copy = X_train.copy(deep=True)
X_train_copy.head(4)


Unnamed: 0,index,country,province,region,price,variety,year_of_wine
82970,86163,US,California,Napa,100.0,Cabernet Sauvignon,2013
4606,4766,US,New York,Long Island,40.0,Sauvignon Blanc,2013
27058,28094,France,Burgundy,Saint-Véran,30.0,Chardonnay,2014
84009,87245,France,Bordeaux,Blaye Côtes de Bordeaux,15.0,Bordeaux-style Red Blend,2016


In [29]:
# Count the rows per country in the training set (shows how many are "US").
# value_counts() already returns the counts in descending order, so no extra
# sort is required; the former bare `frequencies.sort_values` only referenced
# the method without calling it and therefore had no effect.
frequencies = X_train_copy["country"].value_counts()
print(frequencies)

US           37282
France       11412
Italy        11173
Spain         4227
Argentina     2563
Australia     1568
Canada         167
Name: country, dtype: int64


# One hot encoding (constant)

In [30]:
# One-hot encode all categorical features of the test set in a single pass.
# The dummy columns are appended in the listed order (country, province,
# region, variety), exactly as when encoding one column at a time.
X_test = pd.get_dummies(
    X_test,
    columns=['country', 'province', 'region', 'variety'],
)

# Results

In [None]:
# Run one experiment per dirty training frame yielded by the typos generator:
# one-hot encode it, train a decision tree, evaluate on the (clean) test set
# and record the error metrics.
n = 100  # forwarded to generate_dirty_data; presumably the max % of typos -- TODO confirm
i = 0
results = list()

for X_train_typos in typos_generator.generate_dirty_data(X_train_copy, "country", "US", n):
    print("[OK] Experiment {}: ".format(i+1))
    start_time = time.time()
    # get hyperparameters (currently disabled)
    # max_depth, min_samples_split = get_hyperparameters(X_train, y_train)
    # print("max_depth: {}, min_samples_split: {}".format(max_depth, min_samples_split))

    # one hot encoding (variable): the set of dummy columns depends on the
    # typos injected in this experiment
    X_train_encoded = pd.get_dummies(
        X_train_typos,
        columns=['country', 'province', 'region', 'variety'],
    )

    # Give train and test the same columns (missing dummies are filled with 0).
    # The aligned frames are bound to per-iteration names on purpose: rebinding
    # X_test here would carry the typo columns of earlier experiments into all
    # later ones, inflating the column count and making the cell non-idempotent.
    X_train_aligned, X_test_aligned = X_train_encoded.align(
        X_test, join='outer', axis=1, fill_value=0
    )

    # print(X_train_aligned.shape, X_test_aligned.shape, y_train.shape, y_test.shape)

    # apply regression
    # A fixed seed removes the tree's own randomness (features are permuted at
    # each split), so differences between experiments come from the typos only.
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train_aligned, y_train)  # Train the model using the training sets
    y_pred = regressor.predict(X_test_aligned)  # Make predictions using the testing set

    # The evaluation metrics (RMSE is derived from the MSE computed once)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    elapsed_time = round(time.time() - start_time, 2)

    print('Mean Absolute Error:', round(mae, 3))
    print('Mean Squared Error:', round(mse, 3))
    print('Root Mean Squared Error:', round(rmse, 3))
    print('Elapsed time: ', elapsed_time)
    print("*****"*20)

    # store results (n is the 0-based experiment index)
    result = dict(
        n=i,
        columns=len(X_train_aligned.columns),
        elapsed_time=elapsed_time,
        results=dict(mae=mae, mse=mse, rmse=rmse)
    )
    results.append(result)
    i += 1

[OK] Inserting typos in 372 out of 37282 rows in US. (1%)


In [None]:
def get_hyperparameters(X_train, y_train, n_iter_search=20, random_state=None):
    """Pick decision-tree hyperparameters with a randomized, cross-validated search.

    Samples `max_depth` and `min_samples_split` as integers from 2 to 15
    (scipy's randint excludes the upper bound 16) and scores each candidate
    with 5-fold cross-validation on negative mean squared error.

    Parameters
    ----------
    X_train : feature matrix accepted by scikit-learn estimators.
    y_train : regression target matching X_train.
    n_iter_search : int, default 20
        Number of parameter settings sampled by the search.
    random_state : int or None, default None
        Seed passed to both the tree and the search. None keeps the previous
        unseeded behaviour; pass an int to make the search reproducible.

    Returns
    -------
    tuple
        (max_depth, min_samples_split) of the best candidate found.
    """
    param_dist = {'max_depth': sp_randint(2, 16),
                  'min_samples_split': sp_randint(2, 16)}

    search = RandomizedSearchCV(
        DecisionTreeRegressor(random_state=random_state),
        param_distributions=param_dist,
        scoring='neg_mean_squared_error',
        cv=5, n_jobs=1, verbose=1,
        n_iter=n_iter_search,
        random_state=random_state,
    )
    search.fit(X_train, y_train)

    best_params = search.best_params_
    return best_params["max_depth"], best_params["min_samples_split"]

In [None]:
# Pretty-print the list of per-experiment result dictionaries collected above.
import pprint
pp = pprint.PrettyPrinter(indent=4)  # indent nested structures by 4 spaces
pp.pprint(results)