In [2]:
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [3]:
# Dependencies
import numpy as np
import pandas as pd

In [4]:
import keras
keras.__version__

Using TensorFlow backend.


'2.3.1'

In [5]:
# Data Set Information:

# The two datasets are related to red and white variants of the Portuguese "Vinho Verde" wine. For more details, consult: [Web Link] or the reference [Cortez et al., 2009]. Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).

# These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are many more normal wines than excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent or poor wines. Also, we are not sure if all input variables are relevant. So it could be interesting to test feature selection methods.


# Attribute Information:

# For more information, read [Cortez et al., 2009].
# Input variables (based on physicochemical tests):
# 1 - fixed acidity
# 2 - volatile acidity
# 3 - citric acid
# 4 - residual sugar
# 5 - chlorides
# 6 - free sulfur dioxide
# 7 - total sulfur dioxide
# 8 - density
# 9 - pH
# 10 - sulphates
# 11 - alcohol
# Output variable (based on sensory data):
# 12 - quality (score between 0 and 10)

In [6]:
survey = pd.read_csv('Resources/winequality-red.csv')
survey.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
# purged_df = survey[["alcohol", "sulphates", "total sulfur dioxide", "volatile acidity", "quality"]]
# purged_df.head()

## Data Pre-Processing

In [8]:
X = survey.drop("quality", axis=1)
y = survey["quality"]
print(X.shape, y.shape)

(1599, 11) (1599,)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.utils import to_categorical

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y, train_size=0.75, test_size=0.25)


In [11]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [13]:
# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Create a Deep Learning Model

In [14]:
from keras.models import Sequential
from keras.layers import Dense

In [15]:
# Create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=11))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
# model.add(Dense(units=100, activation='relu'))
# model.add(Dense(units=100, activation='relu'))
# model.add(Dense(units=100, activation='relu'))
# model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=6, activation='softmax'))

In [16]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               1200      
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 606       
Total params: 32,106
Trainable params: 32,106
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
 - 0s - loss: 1.3653 - accuracy: 0.4204
Epoch 2/100
 - 0s - loss: 1.1449 - accuracy: 0.4846
Epoch 3/100
 - 0s - loss: 1.0519 - accuracy: 0.5388
Epoch 4/100
 - 0s - loss: 1.0022 - accuracy: 0.5897
Epoch 5/100
 - 0s - loss: 0.9810 - accuracy: 0.5897
Epoch 6/100
 - 0s - loss: 0.9661 - accuracy: 0.6030
Epoch 7/100
 - 0s - loss: 0.9594 - accuracy: 0.6155
Epoch 8/100
 - 0s - loss: 0.9621 - accuracy: 0.5922
Epoch 9/100
 - 0s - loss: 0.9510 - accuracy: 0.5997
Epoch 10/100
 - 0s - loss: 0.9343 - accuracy: 0.6222
Epoch 11/100
 - 0s - loss: 0.9245 - accuracy: 0.6180
Epoch 12/100
 - 0s - loss: 0.9322 - accuracy: 0.5963
Epoch 13/100
 - 0s - loss: 0.9233 - accuracy: 0.6038
Epoch 14/100
 - 0s - loss: 0.9123 - accuracy: 0.6197
Epoch 15/100
 - 0s - loss: 0.9038 - accuracy: 0.6205
Epoch 16/100
 - 0s - loss: 0.9184 - accuracy: 0.6138
Epoch 17/100
 - 0s - loss: 0.9132 - accuracy: 0.6172
Epoch 18/100
 - 0s - loss: 0.9068 - accuracy: 0.6163
Epoch 19/100
 - 0s - loss: 0.8999 - accuracy: 0.6205
Ep

<keras.callbacks.callbacks.History at 0x2bc917b0be0>

## Quantify our Trained Model

In [19]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 1.2307132434844972, Accuracy: 0.6000000238418579


## Make Predictions

In [20]:
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [21]:
print(f"Predicted classes: {list(y_train_categorical[:5])}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: [array([0., 0., 1., 0., 0., 0.], dtype=float32), array([0., 0., 1., 0., 0., 0.], dtype=float32), array([0., 0., 1., 0., 0., 0.], dtype=float32), array([0., 0., 0., 1., 0., 0.], dtype=float32), array([0., 0., 0., 1., 0., 0.], dtype=float32)]
Actual Labels: [6, 5, 8, 4, 5]


In [22]:
# Save the model
model.save('redwinequality_model_trained.h5')

In [23]:
# Load the model
from keras.models import load_model
survey_model = load_model('redwinequality_model_trained.h5')

In [24]:
#Evaluate
model_loss, model_accuracy = survey_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2
)

print(f'Loaded Model Loss: {model_loss}, Accuracy: {model_accuracy}')

Loaded Model Loss: 1.2307132434844972, Accuracy: 0.6000000238418579
