<h1> Base Predictor </h1>

* <b>Reads ground truth db (<i>df_wine_horizontal_tests.csv</i>)</b>
* <b>Includes one hot encoding of categorical variables (e.g. country, variety)</b>
* <b>Calculates HPO</b>
* <b>Retrieves base MSE of clean data</b>

In [8]:
# imports
from copy import copy, deepcopy
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import linear_model
import pandas as pd 
import numpy as np
from sklearn import metrics  
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder


%matplotlib inline

In [9]:
# Load the Wine Dataset
df_wine = pd.read_csv("~/uni/WinePredictor/horizontal_typos/df_wine_horizontal_tests.csv", encoding = 'utf8', index_col=0)
df_wine = df_wine.reset_index()
# check data
df_wine.head(4)

Unnamed: 0,index,country,province,region,price,variety,points,year_of_wine
0,0,US,Oregon,Willamette Valley,14.0,Pinot Gris,87,2013
1,1,US,Michigan,Lake Michigan Shore,13.0,Riesling,87,2013
2,2,US,Oregon,Willamette Valley,65.0,Pinot Noir,87,2012
3,3,Spain,Northern Spain,Navarra,15.0,Tempranillo-Merlot,87,2011


In [10]:
# check distinct countries
countries = pd.unique(df_wine['country'])
print(countries)

['US' 'Spain' 'Italy' 'France' 'Argentina' 'Australia' 'Canada']


In [11]:
frequencies = df_wine['country'].value_counts()
frequencies.sort_values
print(frequencies)

US           53116
France       16319
Italy        16069
Spain         6059
Argentina     3664
Australia     2223
Canada         253
Name: country, dtype: int64


In [12]:
name = frequencies.index[0]
value = frequencies[0]
print("Most frequent: {name} ({value})".format(name=name, value=value))

Most frequent: US (53116)


# One hot encoding

In [13]:
df_wine = pd.get_dummies(df_wine, columns=['country'])
df_wine = pd.get_dummies(df_wine, columns=['province'])
df_wine = pd.get_dummies(df_wine, columns=['region'])
df_wine = pd.get_dummies(df_wine, columns=['variety'])

In [14]:
df_wine.head(4)

Unnamed: 0,index,price,points,year_of_wine,country_Argentina,country_Australia,country_Canada,country_France,country_Italy,country_Spain,...,variety_Vitovska,variety_Viura,variety_Viura-Chardonnay,variety_Viura-Verdejo,variety_White Blend,variety_White Riesling,variety_Xarel-lo,variety_Zibibbo,variety_Zinfandel,variety_Zweigelt
0,0,14.0,87,2013,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,13.0,87,2013,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,65.0,87,2012,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,15.0,87,2011,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# check new headers
headers = list(df_wine)
for header in headers:
    if "country_" in header:
        print(header)

country_Argentina
country_Australia
country_Canada
country_France
country_Italy
country_Spain
country_US


In [16]:
# validate number of wines from US, should be 53116
frequencies_us = df_wine['country_US'].value_counts()
if frequencies_us[1] == value:
    print("[OK] Sizes match after hot encoding.")

[OK] Sizes match after hot encoding.


In [17]:
# split data
df_X = df_wine.drop('points', axis=1)
df_Y = df_wine[['points']]
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.3, random_state=0) 

In [18]:
def get_hyperparameters(X_train, y_train):
    regressor3 = DecisionTreeRegressor()
    param_dist = {'max_depth': sp_randint(2,16),
                  'min_samples_split': sp_randint(2,16)}

    n_iter_search = 20
    clfrs = RandomizedSearchCV(regressor3, 
                               param_distributions=param_dist,
                               scoring='neg_mean_squared_error',
                               cv=5 , n_jobs=1, verbose=1,
                               n_iter=n_iter_search)
    clfrs.fit(X_train, y_train)
    params = clfrs.best_params_
    return params["max_depth"], params["min_samples_split"]

In [19]:
# get hyperparameters
max_depth, min_samples_split = get_hyperparameters(X_train, y_train)
print("max_depth: {}, min_samples_split: {}".format(max_depth, min_samples_split)) 

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  6.6min finished


max_depth: 9, min_samples_split: 15


# BASE RESULT

In [20]:
regressor = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split)  

# Train the model using the training sets
regressor.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regressor.predict(X_test)

mae = round(metrics.mean_absolute_error(y_test, y_pred), 4)
mse = round(metrics.mean_squared_error(y_test, y_pred), 4)
rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 4)

# The evaluation metrics
print('Mean Absolute Error:',  )  
print('Mean Squared Error:', )  
print('Root Mean Squared Error:', ) 

Mean Absolute Error:
Mean Squared Error:
Root Mean Squared Error:
