### Wine Quality Classification using Keras Sequential ANN

#### Import libraries

In [67]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Load dataset

In [68]:
dataset = pd.read_csv('winequality-white.csv', index_col=None, sep=";")
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [69]:
dataset.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [70]:
feature_dataset = dataset.iloc[:,0:-1]
target_dataset = dataset.iloc[:,-1]

In [71]:
feature_dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8


In [72]:
dataset.quality.unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [73]:
dataset.quality.nunique()

7

In [74]:
all_cols = feature_dataset.columns.values
all_cols

array(['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype=object)

#### Feature scaling

In [85]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
#create the specs for the column transformer
#the function "make_column_transformer" will create the column transformer object
#categories="auto" and drop="first" tell the encoder to create k-1 columns, rather than k columns
# and that the first category will be the implicit category
# preprocess = make_column_transformer(
#     (['''select columns to be scaled'''], StandardScaler()))
# )

# option 1 
# preprocess = make_column_transformer((StandardScaler(), all_cols))

# option 2
cat_cols = []
preprocess = make_column_transformer(
    (OneHotEncoder(categories="auto", drop="first"), cat_cols),
    remainder = StandardScaler())



#fit_transform is the function that fits the transformations to the data and then does the transformation
X = preprocess.fit_transform(feature_dataset)

# one hot encode the target


dummy_y = pd.get_dummies(dataset.quality)



In [76]:
dataset.quality

0       6
1       6
2       6
3       6
4       6
       ..
4893    6
4894    5
4895    6
4896    7
4897    6
Name: quality, Length: 4898, dtype: int64

In [77]:
pd.get_dummies(dataset.quality)

Unnamed: 0,3,4,5,6,7,8,9
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
4893,0,0,0,1,0,0,0
4894,0,0,1,0,0,0,0
4895,0,0,0,1,0,0,0
4896,0,0,0,0,1,0,0


In [78]:
X.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [79]:
X.mean(axis=0)

array([-3.48163039e-16,  4.49710592e-16,  1.16054346e-17, -1.06624931e-16,
        2.32108693e-17, -1.01547553e-17, -4.64217386e-17,  3.25648496e-14,
       -1.18375433e-15, -8.21809840e-16, -3.71373908e-16])

#### Train Test Split

In [80]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size=0.20, 
                                                    shuffle=True, random_state=47)

In [81]:
X_train.shape

(3918, 11)

In [82]:
y_train.shape

(3918, 7)

#### Keras Sequential ANN Model

In [83]:
# Sequential Artificial Neural Network Model
from keras.models import Sequential
from keras.layers import Dense, Dropout

# define your keras model here

model = Sequential()

model.add(Dense(50, input_dim = X_train.shape[1], activation='relu'))
model.add(Dropout(0.1))


model.add(Dense(50, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(50, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(50, activation='relu'))


model.add(Dense(7, activation='softmax'))


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 50)                600       
_________________________________________________________________
dropout_8 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 50)                2550      
_________________________________________________________________
dropout_9 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 50)                2550      
_________________________________________________________________
dropout_10 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 50)               

In [42]:
# train the model in 32 batch size and 100-500 epochs
batch_size = 32
epochs = 500
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.callbacks.History at 0x23a8a077788>

#### Prediction an accuracy on train data

In [43]:
y_pred = model.predict(X_train)

In [44]:
y_pred

array([[1.1718430e-06, 1.9082200e-02, 1.7298785e-01, ..., 7.8111440e-03,
        9.6601213e-04, 2.7852937e-07],
       [6.6531220e-06, 1.0066050e-02, 3.9382866e-01, ..., 3.7806069e-03,
        3.4130546e-06, 2.3696150e-07],
       [2.1752027e-07, 1.3251359e-03, 8.3154637e-01, ..., 2.2936314e-02,
        9.1041602e-06, 2.7973510e-10],
       ...,
       [5.0088797e-08, 1.7509298e-04, 2.7553755e-01, ..., 6.1030814e-04,
        7.4141792e-07, 3.9641679e-10],
       [3.4426066e-08, 1.0336059e-04, 5.5961750e-02, ..., 1.4402276e-01,
        8.9645931e-05, 1.2490463e-07],
       [5.2260351e-05, 1.3131969e-05, 2.1619591e-01, ..., 2.6300631e-08,
        8.7757096e-10, 1.7672406e-15]], dtype=float32)

In [47]:
dummy_y

Unnamed: 0,3,4,5,6,7,8,9
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
4893,0,0,0,1,0,0,0
4894,0,0,1,0,0,0,0
4895,0,0,0,1,0,0,0
4896,0,0,0,0,1,0,0


In [48]:
y_pred_ = y_pred.argmax(axis=1)
y_pred_

array([3, 3, 2, ..., 3, 3, 3], dtype=int64)

In [49]:
dummy_y.columns

Int64Index([3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [51]:
y_pred_product = dummy_y.columns[y_pred_]
y_pred_product

Int64Index([6, 6, 5, 6, 6, 6, 6, 6, 7, 5,
            ...
            6, 6, 7, 5, 6, 6, 5, 6, 6, 6],
           dtype='int64', length=3918)

In [53]:
y_train

Unnamed: 0,3,4,5,6,7,8,9
2480,0,0,1,0,0,0,0
665,0,0,1,0,0,0,0
4197,0,0,1,0,0,0,0
4461,0,0,0,1,0,0,0
4435,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
3095,0,0,0,1,0,0,0
2896,0,0,1,0,0,0,0
691,0,0,0,1,0,0,0
3336,0,0,0,1,0,0,0


In [54]:
y_train_ = y_train.values.argmax(axis=1)
y_train_

array([2, 2, 2, ..., 3, 3, 3], dtype=int64)

In [55]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_train_, y_pred_)
accuracy


0.9017355793772333

In [57]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train_, y_pred_)
cm

array([[  13,    0,    0,    1,    0,    0,    0],
       [   0,   96,   16,   10,    0,    0,    0],
       [   0,    4, 1008,  132,   11,    0,    0],
       [   0,    1,   60, 1651,   61,    2,    0],
       [   0,    0,    4,   44,  660,    1,    0],
       [   0,    0,    1,    2,   35,  100,    0],
       [   0,    0,    0,    0,    0,    0,    5]], dtype=int64)

#### Prediction and accuracy on test data

In [60]:
y_pred = model.predict(X_test)
y_pred_ = y_pred.argmax(axis=1)

In [61]:
y_test_ = y_test.values.argmax(axis=1)

In [65]:

accuracy = accuracy_score(y_test_, y_pred_)
accuracy

0.6265306122448979

In [66]:
cm = confusion_matrix(y_test_, y_pred_)
cm

array([[  1,   0,   2,   3,   0,   0],
       [  1,   7,  19,  13,   1,   0],
       [  0,   5, 195,  93,   9,   0],
       [  1,   2,  64, 295,  56,   5],
       [  0,   0,   5,  52, 108,   6],
       [  0,   0,   1,   8,  20,   8]], dtype=int64)