# Analyzing Model Efficiencies for Wine Quality 

#### Author - Tushar Jain | SID: 3032679496

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Graphing Libraries
import cufflinks as cf
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
cf.set_config_file(offline=True, sharing=False, theme='ggplot')


Bad key "text.kerning_factor" on line 4 in
/Users/tusharjain/opt/miniconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
# Regression Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, ElasticNet, Ridge, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from yellowbrick.regressor import AlphaSelection
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor.alphas import alphas

## Data Setup

In [4]:
# Importing DataSet
wine = pd.read_csv("winequality-red.csv", sep=';')
cols = wine.columns[0:11]
cols = cols.insert(0, 'intercept')
cols = cols.insert(0, 'model_type')
final_frame = pd.DataFrame(columns=cols)
errors = pd.DataFrame(columns=['model_type', 'test error'])
display(wine.head())
display(final_frame)
display(errors)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Unnamed: 0,model_type,intercept,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol


Unnamed: 0,model_type,test error


In [5]:
# Splitting into training and testing set
wine_train, wine_test = train_test_split(wine, test_size=0.2)
# Splitting into X_Matrix and Target Variable
x_train, y_train = wine_train.iloc[:, 0:11], wine_train.iloc[:, -1].to_frame()
x_test, y_test = wine_test.iloc[:, 0:11], wine_test.iloc[:, -1].to_frame()
x_train = preprocessing.normalize(x_train)
x_test = preprocessing.normalize(x_test)

In [6]:
def mse_score(model, X, y):
    return np.mean((y - model.predict(X))**2)

## Ordinary Least Squares 

In [7]:
ols = LinearRegression(fit_intercept=True)
ols.fit(x_train, y_train)
pred = ols.predict(x_test)
ols_error = mean_squared_error(y_test, pred)
err_df = pd.DataFrame([['OLS', ols_error]], columns=errors.columns[0:2])
errors = errors.append(err_df)
print('Test Error OLS: ', ols_error)

Test Error OLS:  0.4474853555333391


In [8]:
ols_param_df = pd.DataFrame(ols.intercept_, columns=['intercept'])\
    .merge(pd.DataFrame(ols.coef_, columns=wine.columns.values[0:11]), left_index=True, right_index=True)
ols_param_df['model_type'] = ['OLS']
final_frame = final_frame.append(ols_param_df)

## Ridge Regression

In [10]:
alphas = np.logspace(-2, 2, 10)
cv_values = []
for alpha in alphas:
    ridge_model = Ridge(alpha=alpha, fit_intercept=True)
    cv_values.append(np.mean(cross_val_score(
        ridge_model, x_train, y_train['quality'], scoring=mse_score, cv=5)))

In [11]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=alphas, y=cv_values,
                         mode="lines+markers", name="CV"))
fig.update_layout(title='RidgeCV Error',
                  xaxis_title="Alpha", yaxis_title="CV MSE")

In [12]:
best_alpha_ridge = alphas[np.argmin(cv_values)]
print('Best Alpha Value:', best_alpha_ridge)
min_cv_error_ridge = min(cv_values)
print('Min CV Error:', min_cv_error_ridge)
ridge_model = Ridge(alpha=best_alpha_ridge, fit_intercept=True)
ridge_model.fit(x_train, y_train['quality'])
pred = ridge_model.predict(x_test)
ridge_error = mean_squared_error(y_test, pred)
err_df = pd.DataFrame([['RIDGE', ridge_error]], columns=errors.columns[0:2])
errors = errors.append(err_df)
print('Test Error Ridge:', ridge_error)

Best Alpha Value: 0.01
Min CV Error: 0.426186047441006
Test Error Ridge: 0.45922324227742256


In [13]:
data = []
for i in ridge_model.coef_:
    data.append([i])
ridge_dict = dict(zip(wine.columns[0:11], data))
ridge_df = pd.DataFrame(ridge_dict)
ridge_df['intercept'] = [ridge_model.intercept_]
ridge_df['model_type'] = ['RIDGE']
final_frame = final_frame.append(ridge_df)

## Lasso Regression

In [14]:
alphas = np.logspace(-2, 2, 10)
cv_values = []
for alpha in alphas:
    lasso_model = Lasso(alpha=alpha, fit_intercept=True)
    cv_values.append(np.mean(cross_val_score(
        lasso_model, x_train, y_train['quality'], scoring=mse_score, cv=5)))

In [15]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=alphas, y=cv_values,
                         mode="lines+markers", name="CV"))
fig.update_layout(title='LassoCV Error',
                  xaxis_title="Alpha", yaxis_title="CV MSE")

In [16]:
best_alpha_lasso = alphas[np.argmin(cv_values)]
print('Best Alpha Value:', best_alpha_lasso)
min_cv_error_lasso = min(cv_values)
print('Min CV Error:', min_cv_error_lasso)
lasso_model = Lasso(alpha=best_alpha_lasso, fit_intercept=True)
lasso_model.fit(x_train, y_train['quality'])
pred = lasso_model.predict(x_test)
lasso_error = mean_squared_error(y_test, pred)
err_df = pd.DataFrame([['LASSO', lasso_error]], columns=errors.columns[0:2])
errors = errors.append(err_df)
print('Test Error Lasso:', lasso_error)

Best Alpha Value: 0.01
Min CV Error: 0.6170799520253795
Test Error Lasso: 0.6735943486511216


In [17]:
data = []
for i in lasso_model.coef_:
    data.append([i])
lasso_dict = dict(zip(wine.columns[0:11], data))
lasso_df = pd.DataFrame(lasso_dict)
lasso_df['intercept'] = [lasso_model.intercept_]
lasso_df['model_type'] = ['LASSO']
final_frame = final_frame.append(lasso_df)

## Elastic Net

In [18]:
alphas = np.logspace(-1, 1, 10)
ratio = np.arange(0, 1, 0.1)
cv_values = []
alpha_r_pair = []
alphals = []
rls = []
for alpha in alphas:
    for r in ratio:
        enet_model = ElasticNet(alpha=alpha, l1_ratio=r, fit_intercept=True)
        cv_values.append(np.mean(cross_val_score(
            enet_model, x_train, y_train['quality'], scoring=mse_score, cv=5)))
        alpha_r_pair.append((alpha, r))
        alphals.append(alpha)
        rls.append(r)

In [19]:
fig = px.scatter_3d(x=alphals, y=rls, z=cv_values)
fig.update_layout(title='ElasticNetCV Error', scene=dict(
    xaxis_title="Alpha",
    yaxis_title='L1 Ratio',
    zaxis_title='CV MSE'))
fig.show()

In [20]:
best_alpha_enet = alpha_r_pair[np.argmin(cv_values)][0]
best_r = alpha_r_pair[np.argmin(cv_values)][1]
print('Best Alpha Value:', best_alpha_enet)
print('Best L1 Ratio Value:', best_r)
min_cv_error_enet = min(cv_values)
print('Min CV Error:', min_cv_error_enet)
enet_model = ElasticNet(alpha=best_alpha_enet,
                        l1_ratio=best_r, fit_intercept=True)
enet_model.fit(x_train, y_train['quality'])
pred = lasso_model.predict(x_test)
enet_error = mean_squared_error(y_test, pred)
err_df = pd.DataFrame([['ELASTIC NET', enet_error]],
                      columns=errors.columns[0:2])
errors = errors.append(err_df)
print('Test Error Elastic Net:', enet_error)

Best Alpha Value: 0.1
Best L1 Ratio Value: 0.0
Min CV Error: 0.6237532721382059
Test Error Elastic Net: 0.6735943486511216


In [21]:
data = []
for i in enet_model.coef_:
    data.append([i])
enet_dict = dict(zip(wine.columns[0:11], data))
enet_df = pd.DataFrame(enet_dict)
enet_df['intercept'] = [enet_model.intercept_]
enet_df['model_type'] = ['ELASTIC NET']
final_frame = final_frame.append(enet_df)

## Results

### Model Coefficients

In [22]:
final_frame

Unnamed: 0,model_type,intercept,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,OLS,2.80762,1.89433,-36.423932,-15.215395,0.547111,-33.795372,1.447765,2.133576,-54.007053,-15.238384,36.040313,10.668204
0,RIDGE,2.655863,0.706754,-29.183986,-3.384139,0.310149,-6.677676,1.572868,2.235706,-9.19853,-23.637059,21.963111,10.149985
0,LASSO,5.391923,0.0,-0.0,0.0,0.0,-0.0,0.194674,-0.0,0.0,-0.0,0.0,0.612201
0,ELASTIC NET,5.622179,0.108703,-0.015597,0.014455,0.027724,-0.000415,0.105794,-0.126545,0.00542,0.013846,0.014019,0.16093


### Test Errors

In [23]:
errors

Unnamed: 0,model_type,test error
0,OLS,0.447485
0,RIDGE,0.459223
0,LASSO,0.673594
0,ELASTIC NET,0.673594


### Minimum Cross Validation Errors

In [24]:
models = ['RIDGE', 'LASSO', 'ELASTIC NET']
mincv_vals = [min_cv_error_ridge, min_cv_error_lasso, min_cv_error_enet]
mincv = pd.DataFrame({'Model type': models, 'Min CV error': mincv_vals})
mincv

Unnamed: 0,Model type,Min CV error
0,RIDGE,0.426186
1,LASSO,0.61708
2,ELASTIC NET,0.623753


### Optimal Tuning Parameters

In [25]:
models = ['RIDGE', 'LASSO', 'ELASTIC NET', 'ELASTIC NET']
params = ['alpha', 'alpha', 'alpha', 'L1 Ratio']
vals = [best_alpha_ridge, best_alpha_lasso, best_alpha_enet, best_r]
opt_param = pd.DataFrame(
    {'Model type': models, 'Hyperparameter': params, 'Optimal Value': vals})
opt_param

Unnamed: 0,Model type,Hyperparameter,Optimal Value
0,RIDGE,alpha,0.01
1,LASSO,alpha,0.01
2,ELASTIC NET,alpha,0.1
3,ELASTIC NET,L1 Ratio,0.0


## Conclusion

In [26]:
print('OLS Method has the least test error of', "{:.5f}".format(ols_error),
      'followed by Ridge Regression with a test error of', "{:.5f}".format(
          ridge_error),
      '. The Lasso and Elastic-Net Models have the worst performance metric with test errors of',
      "{:.5f}".format(lasso_error), 'and', "{:.5f}".format(enet_error), 'respectively. Additionally, the L1 Ratio of the elastic net model is', 
      "{:.5f}".format(best_r), 'which signifies that Elastic-Net model prefers the Ridge Regression model above the Lasso Model.')

OLS Method has the least test error of 0.44749 followed by Ridge Regression with a test error of 0.45922 . The Lasso and Elastic-Net Models have the worst performance metric with test errors of 0.67359 and 0.67359 respectively. Additionally, the L1 Ratio of the elastic net model is 0.00000 which signifies that Elastic-Net model prefers the Ridge Regression model above the Lasso Model.
