In [1]:
import numpy as np

In [2]:
np.set_printoptions(suppress = True, linewidth = 100)

In [3]:
raw_white_wine = np.genfromtxt("winequality-white.csv", delimiter = ';', skip_header = 1)
raw_white_wine

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]])

In [4]:
raw_white_wine.shape

(4898, 12)

In [None]:
# legends
# fixed acidity (g(tartaric acid)/L), volatile acidity (g(acetic acid)/L), citric acid(g/L)
# residual sugar (g/L), chlorides(g(sodium chloride)/L), 
# free sulfur dioxide (mg/L), total sulfur dioxide (mg/L)
# density (g/ml), pH , sulphates(g(potassium sulphate)/L), alcohol ( % vol.)

In [5]:
header_white = np.genfromtxt("winequality-white.csv", delimiter = ';', skip_footer = raw_white_wine.shape[0], autostrip = True, dtype = 'str')
header_white

array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
       '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
       '"alcohol"', '"quality"'], dtype='<U22')

In [6]:
# screeening for missing values

np.isnan(raw_white_wine).sum()

0

In [7]:
header_white[0]

'"fixed acidity"'

In [8]:
raw_white_wine[:,0]

array([7. , 6.3, 8.1, ..., 6.5, 5.5, 6. ])

In [9]:
stats_whitewine = np.array([np.min(raw_white_wine[:,0]), np.max(raw_white_wine[:,0])])
stats_whitewine

array([ 3.8, 14.2])

In [10]:
np.unique(raw_white_wine[:,0])

array([ 3.8 ,  3.9 ,  4.2 ,  4.4 ,  4.5 ,  4.6 ,  4.7 ,  4.8 ,  4.9 ,  5.  ,  5.1 ,  5.2 ,  5.3 ,
        5.4 ,  5.5 ,  5.6 ,  5.7 ,  5.8 ,  5.9 ,  6.  ,  6.1 ,  6.15,  6.2 ,  6.3 ,  6.4 ,  6.45,
        6.5 ,  6.6 ,  6.7 ,  6.8 ,  6.9 ,  7.  ,  7.1 ,  7.15,  7.2 ,  7.3 ,  7.4 ,  7.5 ,  7.6 ,
        7.7 ,  7.8 ,  7.9 ,  8.  ,  8.1 ,  8.2 ,  8.3 ,  8.4 ,  8.5 ,  8.6 ,  8.7 ,  8.8 ,  8.9 ,
        9.  ,  9.1 ,  9.2 ,  9.3 ,  9.4 ,  9.5 ,  9.6 ,  9.7 ,  9.8 ,  9.9 , 10.  , 10.2 , 10.3 ,
       10.7 , 11.8 , 14.2 ])

In [11]:
# grouping fixed acidity into 4 stages: 
# low value of parameter (3.8-6.2)-- denoted by 1
# notr value of (6.3-7.9)-- denoted by 2
# high value of (8.0-9.5)-- denoted by 3
# very high value (9.6-14.2)-- denoted by 4

low_value = np.array([3.8 ,  3.9 ,  4.2 ,  4.4 ,  4.5 ,  4.6 ,  4.7 ,  4.8 ,  4.9 ,  
                      5.  ,  5.1 ,  5.2 ,  5.3 , 5.4 ,  5.5 ,  5.6 ,  5.7 ,  5.8 ,  5.9 ,  
                      6.  ,  6.1 ,  6.15,  6.2])
notr_value = np.array([6.3 ,  6.4 ,  6.45, 6.5 ,  6.6 ,  6.7 ,  6.8 ,  6.9 ,  
                       7.  ,  7.1 ,  7.15,  7.2 ,  7.3 ,  7.4 ,  7.5 ,  7.6 , 7.7 ,  7.8 ,  7.9])
high_value = np.array([8.  ,  8.1 ,  8.2 ,  8.3 ,  8.4 ,  8.5 ,  8.6 ,  8.7 ,  8.8 ,  8.9 , 
                       9.  ,  9.1 ,  9.2 ,  9.3 ,  9.4 ,  9.5])
very_high = np.array([9.6 ,  9.7 ,  9.8 ,  9.9 , 10.  , 10.2 , 10.3 , 10.7 , 11.8 , 14.2])

In [12]:
raw_white_wine[:,0] = np.where(np.isin(raw_white_wine[:,0], low_value), 1, raw_white_wine[:,0])
raw_white_wine[:,0] = np.where(np.isin(raw_white_wine[:,0], notr_value), 2, raw_white_wine[:,0])
raw_white_wine[:,0] = np.where(np.isin(raw_white_wine[:,0], high_value), 3, raw_white_wine[:,0])
raw_white_wine[:,0] = np.where(np.isin(raw_white_wine[:,0], very_high), 4, raw_white_wine[:,0])

In [13]:
# checking data after changes made

np.unique(raw_white_wine[:,0], return_counts = True)

(array([1., 2., 3., 4.]), array([1107, 3317,  445,   29]))

In [14]:
# checking data after changes made

len(raw_white_wine[:,0])

4898

In [15]:
header_white[1]

'"volatile acidity"'

In [16]:
np.unique(raw_white_wine[:,1])

array([0.08 , 0.085, 0.09 , 0.1  , 0.105, 0.11 , 0.115, 0.12 , 0.125, 0.13 , 0.135, 0.14 , 0.145,
       0.15 , 0.155, 0.16 , 0.165, 0.17 , 0.175, 0.18 , 0.185, 0.19 , 0.2  , 0.205, 0.21 , 0.215,
       0.22 , 0.225, 0.23 , 0.235, 0.24 , 0.245, 0.25 , 0.255, 0.26 , 0.265, 0.27 , 0.275, 0.28 ,
       0.285, 0.29 , 0.295, 0.3  , 0.305, 0.31 , 0.315, 0.32 , 0.325, 0.33 , 0.335, 0.34 , 0.345,
       0.35 , 0.355, 0.36 , 0.365, 0.37 , 0.375, 0.38 , 0.385, 0.39 , 0.395, 0.4  , 0.405, 0.41 ,
       0.415, 0.42 , 0.425, 0.43 , 0.435, 0.44 , 0.445, 0.45 , 0.455, 0.46 , 0.47 , 0.475, 0.48 ,
       0.485, 0.49 , 0.495, 0.5  , 0.51 , 0.52 , 0.53 , 0.54 , 0.545, 0.55 , 0.555, 0.56 , 0.57 ,
       0.58 , 0.585, 0.59 , 0.595, 0.6  , 0.61 , 0.615, 0.62 , 0.63 , 0.64 , 0.65 , 0.655, 0.66 ,
       0.67 , 0.68 , 0.685, 0.69 , 0.695, 0.705, 0.71 , 0.73 , 0.74 , 0.75 , 0.76 , 0.78 , 0.785,
       0.815, 0.85 , 0.905, 0.91 , 0.93 , 0.965, 1.005, 1.1  ])

In [17]:
# The average volatile acidity value for white table wines is about 0.43g/L (Vilela, 2018).
# data for volatile acidity of white wine will be classified into 3: below average(0.080- 0.375)-- denoted by 0
# average (0.380- 0.500)-- denoted by 1
# above average (0.510- 1.100)-- denoted by 2

below_average = np.array([0.08 , 0.085, 0.09 , 0.1  , 0.105, 0.11 , 0.115, 0.12 , 0.125, 
                          0.13 , 0.135, 0.14 , 0.145, 0.15 , 0.155, 0.16 , 0.165, 0.17 , 0.175, 
                          0.18 , 0.185, 0.19 , 0.2  , 0.205, 0.21 , 0.215, 
                          0.22 , 0.225, 0.23 , 0.235, 0.24 , 0.245, 0.25 , 0.255, 0.26 , 0.265, 
                          0.27 , 0.275, 0.28 , 0.285, 0.29 , 0.295, 0.3  , 0.305, 
                          0.31 , 0.315, 0.32 , 0.325, 0.33 , 0.335, 0.34 , 0.345, 0.35 , 0.355, 
                          0.36 , 0.365, 0.37 , 0.375])
average = np.array([0.38 , 0.385, 0.39 , 0.395, 0.4  , 0.405, 0.41 , 0.415, 0.42 , 0.425, 0.43 , 0.435, 
                    0.44 , 0.445, 0.45 , 0.455, 0.46 , 0.47 , 0.475, 0.48 , 0.485, 0.49 , 0.495, 0.5])
above_average = np.array([0.51 , 0.52 , 0.53 , 0.54 , 0.545, 0.55 , 0.555, 0.56 , 0.57 , 0.58 , 0.585, 
                          0.59 , 0.595, 0.6  , 0.61 , 0.615, 0.62 , 0.63 , 0.64 , 0.65 , 0.655, 0.66 , 
                          0.67 , 0.68 , 0.685, 0.69 , 0.695, 0.705, 0.71 , 0.73 , 0.74 , 0.75 , 0.76 , 
                          0.78 , 0.785, 0.815, 0.85 , 0.905, 0.91 , 0.93 , 0.965, 1.005, 1.1])

In [18]:
raw_white_wine[:,1] = np.where(np.isin(raw_white_wine[:,1], above_average), 2, raw_white_wine[:,1])

In [19]:
raw_white_wine[:,1] = np.where(np.isin(raw_white_wine[:,1], average), 1, raw_white_wine[:,1])

In [20]:
raw_white_wine[:,1] = np.where(np.isin(raw_white_wine[:,1], below_average), 0, raw_white_wine[:,1])

In [21]:
np.unique(raw_white_wine[:,1])

array([0., 1., 2.])

In [22]:
len(raw_white_wine[:,1])

4898

In [23]:
header_white[2]

'"citric acid"'

In [24]:
np.unique(raw_white_wine[:,2])

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.12, 0.13, 0.14,
       0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29,
       0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
       0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59,
       0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74,
       0.78, 0.79, 0.8 , 0.81, 0.82, 0.86, 0.88, 0.91, 0.99, 1.  , 1.23, 1.66])

In [25]:
# It can be added to finished wines to increase acidity and give a “fresh" flavor.
# The disadvantage of adding citric acid is its microbial instability. 
# Citric acid is one of the less commonly found acid’s in wine. Thus we will employ binary grouping.
# we will classify citric acid(cc) data column according to its existence: not added-- denoted by 0.
# citric acid added-- denoted by 1.
#!! for non exist- the range is applied since 0.04 can be just noise due to instrumental measurement.

cc_not_added = np.array([0. , 0.01, 0.02, 0.03, 0.04])
cc_added = np.array([0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 
                     0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 
                     0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 
                     0.4 , 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 
                     0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 
                     0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 
                     0.7 , 0.71, 0.72, 0.73, 0.74, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.86, 0.88, 
                     0.91, 0.99, 1.  , 1.23, 1.66])

In [26]:
raw_white_wine[:,2] = np.where(np.isin(raw_white_wine[:,2], cc_added), 1, raw_white_wine[:,2])

In [27]:
raw_white_wine[:,2] = np.where(np.isin(raw_white_wine[:,2], cc_not_added), 0, raw_white_wine[:,2])

In [28]:
np.unique(raw_white_wine[:,2])

array([0., 1.])

In [29]:
len(raw_white_wine[:,2])

4898

In [30]:
def checkpoint(file_name, checkpoint_header, checkpoint_data):
    np.savez(file_name, header = checkpoint_header, data = checkpoint_data)
    checkpoint_variable = np.load(file_name + ".npz")
    return(checkpoint_variable)

In [31]:
WhiteWine_checkpoint_I = checkpoint("WhiteWine-checkpoint-I", header_white, raw_white_wine)

In [32]:
WhiteWine_checkpoint_I['header'], WhiteWine_checkpoint_I['data']

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 array([[ 2.  ,  0.  ,  1.  , ...,  0.45,  8.8 ,  6.  ],
        [ 2.  ,  0.  ,  1.  , ...,  0.49,  9.5 ,  6.  ],
        [ 3.  ,  0.  ,  1.  , ...,  0.44, 10.1 ,  6.  ],
        ...,
        [ 2.  ,  0.  ,  1.  , ...,  0.46,  9.4 ,  6.  ],
        [ 1.  ,  0.  ,  1.  , ...,  0.38, 12.8 ,  7.  ],
        [ 1.  ,  0.  ,  1.  , ...,  0.32, 11.8 ,  6.  ]]))

In [33]:
header_white[3]

'"residual sugar"'

In [34]:
# Residual sugar levels vary in different styles of wine. 
# ‘Dry’ wines: 0-4 g/L-- denoted by 0 
# ‘Sweet’ wines: 35 g/L and then go up from there -- denoted by 1

dry_wine = np.arange(0.00, 35.00, 0.01)

In [35]:
dry_wine

array([ 0.  ,  0.01,  0.02, ..., 34.97, 34.98, 34.99])

In [36]:
np.unique(raw_white_wine[:,3])

array([ 0.6 ,  0.7 ,  0.8 ,  0.9 ,  0.95,  1.  ,  1.05,  1.1 ,  1.15,  1.2 ,  1.25,  1.3 ,  1.35,
        1.4 ,  1.45,  1.5 ,  1.55,  1.6 ,  1.65,  1.7 ,  1.75,  1.8 ,  1.85,  1.9 ,  1.95,  2.  ,
        2.05,  2.1 ,  2.2 ,  2.25,  2.3 ,  2.35,  2.4 ,  2.5 ,  2.6 ,  2.65,  2.7 ,  2.8 ,  2.85,
        2.9 ,  3.  ,  3.1 ,  3.15,  3.2 ,  3.3 ,  3.4 ,  3.5 ,  3.6 ,  3.7 ,  3.75,  3.8 ,  3.85,
        3.9 ,  3.95,  4.  ,  4.1 ,  4.2 ,  4.25,  4.3 ,  4.35,  4.4 ,  4.45,  4.5 ,  4.55,  4.6 ,
        4.7 ,  4.75,  4.8 ,  4.85,  4.9 ,  5.  ,  5.1 ,  5.15,  5.2 ,  5.25,  5.3 ,  5.35,  5.4 ,
        5.45,  5.5 ,  5.55,  5.6 ,  5.7 ,  5.8 ,  5.85,  5.9 ,  5.95,  6.  ,  6.1 ,  6.2 ,  6.3 ,
        6.35,  6.4 ,  6.5 ,  6.55,  6.6 ,  6.65,  6.7 ,  6.75,  6.8 ,  6.85,  6.9 ,  6.95,  7.  ,
        7.05,  7.1 ,  7.2 ,  7.25,  7.3 ,  7.35,  7.4 ,  7.45,  7.5 ,  7.6 ,  7.7 ,  7.75,  7.8 ,
        7.85,  7.9 ,  7.95,  8.  ,  8.1 ,  8.15,  8.2 ,  8.25,  8.3 ,  8.4 ,  8.45,  8.5 ,  8.55,
        8.6 ,  8.65,

In [37]:
raw_white_wine[:,3] = np.where(np.isin(raw_white_wine[:,3], dry_wine), 0, 1)

In [38]:
np.unique(raw_white_wine[:,3])

array([0., 1.])

In [39]:
len(raw_white_wine[:,3])

4898

In [40]:
header_white, header_white[4]

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 '"chlorides"')

In [41]:
np.unique(raw_white_wine[:,4])

array([0.009, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, 0.019, 0.02 , 0.021, 0.022, 0.023,
       0.024, 0.025, 0.026, 0.027, 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036,
       0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045, 0.046, 0.047, 0.048, 0.049,
       0.05 , 0.051, 0.052, 0.053, 0.054, 0.055, 0.056, 0.057, 0.058, 0.059, 0.06 , 0.061, 0.062,
       0.063, 0.064, 0.065, 0.066, 0.067, 0.068, 0.069, 0.07 , 0.071, 0.072, 0.073, 0.074, 0.075,
       0.076, 0.077, 0.078, 0.079, 0.08 , 0.081, 0.082, 0.083, 0.084, 0.085, 0.086, 0.087, 0.088,
       0.089, 0.09 , 0.091, 0.092, 0.093, 0.094, 0.095, 0.096, 0.097, 0.098, 0.099, 0.102, 0.104,
       0.105, 0.108, 0.11 , 0.112, 0.114, 0.115, 0.117, 0.118, 0.119, 0.12 , 0.121, 0.122, 0.123,
       0.126, 0.127, 0.13 , 0.132, 0.133, 0.135, 0.136, 0.137, 0.138, 0.142, 0.144, 0.145, 0.146,
       0.147, 0.148, 0.149, 0.15 , 0.152, 0.154, 0.156, 0.157, 0.158, 0.16 , 0.167, 0.168, 0.169,
       0.17 , 0.171,

In [42]:
# Chlorides (sodium chloride) give the wine a salty flavor which may turn away potential consumers. 
# The max. conc. of chlorides in wine is about 0.20 - 0.60 g/L (Vallone et al.,2021).
# Not salty (below_max. conc.range) -- denoted by 0.
# Salty (within max. conc.range) -- denoted by 1.

not_salty = np.arange(0.000, 0.199, 0.001)

In [43]:
not_salty

array([0.   , 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01 , 0.011, 0.012,
       0.013, 0.014, 0.015, 0.016, 0.017, 0.018, 0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025,
       0.026, 0.027, 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, 0.037, 0.038,
       0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045, 0.046, 0.047, 0.048, 0.049, 0.05 , 0.051,
       0.052, 0.053, 0.054, 0.055, 0.056, 0.057, 0.058, 0.059, 0.06 , 0.061, 0.062, 0.063, 0.064,
       0.065, 0.066, 0.067, 0.068, 0.069, 0.07 , 0.071, 0.072, 0.073, 0.074, 0.075, 0.076, 0.077,
       0.078, 0.079, 0.08 , 0.081, 0.082, 0.083, 0.084, 0.085, 0.086, 0.087, 0.088, 0.089, 0.09 ,
       0.091, 0.092, 0.093, 0.094, 0.095, 0.096, 0.097, 0.098, 0.099, 0.1  , 0.101, 0.102, 0.103,
       0.104, 0.105, 0.106, 0.107, 0.108, 0.109, 0.11 , 0.111, 0.112, 0.113, 0.114, 0.115, 0.116,
       0.117, 0.118, 0.119, 0.12 , 0.121, 0.122, 0.123, 0.124, 0.125, 0.126, 0.127, 0.128, 0.129,
       0.13 , 0.131,

In [44]:
raw_white_wine[:,4] = np.where(np.isin(raw_white_wine[:,4], not_salty), 0, 1)

In [45]:
np.unique(raw_white_wine[:,4], return_counts = True)

(array([0., 1.]), array([4186,  712]))

In [46]:
header_white, header_white[5]

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 '"free sulfur dioxide"')

In [47]:
np.unique(raw_white_wine[:,5])

array([  2. ,   3. ,   4. ,   5. ,   6. ,   7. ,   8. ,   9. ,  10. ,  11. ,  11.5,  12. ,  13. ,
        14. ,  15. ,  15.5,  16. ,  17. ,  18. ,  19. ,  19.5,  20. ,  21. ,  22. ,  23. ,  23.5,
        24. ,  25. ,  26. ,  27. ,  28. ,  28.5,  29. ,  30. ,  30.5,  31. ,  32. ,  33. ,  34. ,
        35. ,  35.5,  36. ,  37. ,  38. ,  38.5,  39. ,  39.5,  40. ,  40.5,  41. ,  41.5,  42. ,
        42.5,  43. ,  43.5,  44. ,  44.5,  45. ,  46. ,  47. ,  48. ,  48.5,  49. ,  50. ,  50.5,
        51. ,  51.5,  52. ,  52.5,  53. ,  54. ,  55. ,  56. ,  57. ,  58. ,  59. ,  59.5,  60. ,
        60.5,  61. ,  61.5,  62. ,  63. ,  64. ,  64.5,  65. ,  66. ,  67. ,  68. ,  69. ,  70. ,
        70.5,  71. ,  72. ,  73. ,  73.5,  74. ,  75. ,  76. ,  77. ,  77.5,  78. ,  79. ,  79.5,
        80. ,  81. ,  82. ,  82.5,  83. ,  85. ,  86. ,  87. ,  88. ,  89. ,  93. ,  95. ,  96. ,
        97. ,  98. , 101. , 105. , 108. , 110. , 112. , 118.5, 122.5, 124. , 128. , 131. , 138.5,
       146.5, 289. ]

In [48]:
# SO2 and its sulfite salts are essential for anti-oxidation and preservation properties. 
# SO2 Total = SO2 free + SO2 reacted.
# During maturation and storage, free SO2 conc. values of 30 mg/L on white wine are recommended.
# not recommended conc. is denoted by 0.
# recommended conc. range (30-3, 30+3) is denoted by 1.

recommended_range = np.array([27. ,  28. ,  28.5,  29. ,  30. ,  30.5,  31. ,  32. ,  33.])

In [49]:
raw_white_wine[:,5] = np.where(np.isin(raw_white_wine[:,5], recommended_range), 1, 0)

In [50]:
np.unique(raw_white_wine[:,5])

array([0., 1.])

In [51]:
len(raw_white_wine[:,5])

4898

In [52]:
header_white, header_white[6]

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 '"total sulfur dioxide"')

In [53]:
np.unique(raw_white_wine[:,6], return_counts = True)

(array([  9. ,  10. ,  18. ,  19. ,  21. ,  24. ,  25. ,  26. ,  28. ,  29. ,  30. ,  31. ,  33. ,
         34. ,  37. ,  40. ,  41. ,  44. ,  45. ,  46. ,  47. ,  48. ,  49. ,  50. ,  51. ,  53. ,
         54. ,  55. ,  56. ,  57. ,  58. ,  59. ,  60. ,  61. ,  62. ,  63. ,  64. ,  65. ,  66. ,
         67. ,  68. ,  69. ,  70. ,  71. ,  72. ,  73. ,  74. ,  75. ,  76. ,  77. ,  78. ,  79. ,
         80. ,  81. ,  82. ,  83. ,  84. ,  85. ,  86. ,  87. ,  88. ,  89. ,  90. ,  91. ,  92. ,
         93. ,  94. ,  95. ,  96. ,  97. ,  98. ,  99. , 100. , 101. , 102. , 103. , 104. , 105. ,
        106. , 107. , 108. , 109. , 110. , 111. , 112. , 113. , 114. , 115. , 115.5, 116. , 117. ,
        118. , 119. , 120. , 121. , 122. , 123. , 124. , 125. , 126. , 127. , 128. , 129. , 129.5,
        130. , 131. , 132. , 133. , 134. , 135. , 136. , 137. , 138. , 139. , 140. , 141. , 142. ,
        143. , 144. , 145. , 146. , 147. , 148. , 149. , 150. , 151. , 152. , 153. , 154. , 155. ,
        15

In [54]:
# EU allows white wine to have maximum total sulfur dioxide between 200-350 mg/L.
# not recommended range(350 and above)-- is denoted by 0.
# values not exceeding the range-- is denoted by 1. 

out_of_limit = np.array([366.5, 440.])

In [55]:
raw_white_wine[:,6] = np.where(np.isin(raw_white_wine[:,6], out_of_limit), 0, 1)

In [56]:
np.unique(raw_white_wine[:,6], return_counts = True)

(array([0., 1.]), array([   2, 4896]))

In [57]:
header_white, header_white[7]

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 '"density"')

In [58]:
np.unique(raw_white_wine[:,7], return_counts = True)

(array([0.98711 , 0.98713 , 0.98722 , 0.9874  , 0.98742 , 0.98746 , 0.98758 , 0.98774 , 0.98779 ,
        0.98794 , 0.98802 , 0.98815 , 0.98816 , 0.98819 , 0.98822 , 0.98823 , 0.988245, 0.98834 ,
        0.98836 , 0.9884  , 0.98845 , 0.98853 , 0.98854 , 0.98856 , 0.9886  , 0.98862 , 0.98865 ,
        0.98867 , 0.98868 , 0.98869 , 0.9887  , 0.98871 , 0.98872 , 0.98876 , 0.98878 , 0.9888  ,
        0.98882 , 0.98883 , 0.98884 , 0.98886 , 0.98889 , 0.9889  , 0.98892 , 0.98894 , 0.98895 ,
        0.98896 , 0.98898 , 0.989   , 0.98902 , 0.98904 , 0.98906 , 0.9891  , 0.98912 , 0.98913 ,
        0.98914 , 0.98915 , 0.98916 , 0.98918 , 0.98919 , 0.9892  , 0.98922 , 0.98923 , 0.98924 ,
        0.98926 , 0.98928 , 0.9893  , 0.98931 , 0.989315, 0.98934 , 0.98935 , 0.98936 , 0.98938 ,
        0.98939 , 0.9894  , 0.98941 , 0.98942 , 0.989435, 0.98944 , 0.98945 , 0.98946 , 0.989465,
        0.98947 , 0.98948 , 0.98949 , 0.9895  , 0.98951 , 0.98952 , 0.98953 , 0.98954 , 0.98956 ,
        0.98958 , 0.

In [59]:
# Density is the mass per unit volume of wine. 
# After fermentation is complete, the wine should be at, or slightly less than 1.00.
# above 1.1g/ml threshold value-- is denoted by 0.
# correct density range is denoted by 1. 

above_threshold = np.arange(1.1, 2.1, 0.000001)

In [60]:
raw_white_wine[:,7] = np.where(np.isin(raw_white_wine[:,7], above_threshold), 0, 1)

In [61]:
np.unique(raw_white_wine[:,7], return_counts = True)

(array([1.]), array([4898]))

In [62]:
header_white, header_white[8]

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 '"pH"')

In [63]:
np.unique(raw_white_wine[:,8], return_counts = True)

(array([2.72, 2.74, 2.77, 2.79, 2.8 , 2.82, 2.83, 2.84, 2.85, 2.86, 2.87, 2.88, 2.89, 2.9 , 2.91,
        2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99, 3.  , 3.01, 3.02, 3.03, 3.04, 3.05, 3.06,
        3.07, 3.08, 3.09, 3.1 , 3.11, 3.12, 3.13, 3.14, 3.15, 3.16, 3.17, 3.18, 3.19, 3.2 , 3.21,
        3.22, 3.23, 3.24, 3.25, 3.26, 3.27, 3.28, 3.29, 3.3 , 3.31, 3.32, 3.33, 3.34, 3.35, 3.36,
        3.37, 3.38, 3.39, 3.4 , 3.41, 3.42, 3.43, 3.44, 3.45, 3.46, 3.47, 3.48, 3.49, 3.5 , 3.51,
        3.52, 3.53, 3.54, 3.55, 3.56, 3.57, 3.58, 3.59, 3.6 , 3.61, 3.62, 3.63, 3.64, 3.65, 3.66,
        3.67, 3.68, 3.69, 3.7 , 3.72, 3.74, 3.75, 3.76, 3.77, 3.79, 3.8 , 3.81, 3.82]),
 array([  1,   1,   1,   3,   3,   1,   4,   1,   9,   9,   9,  11,  17,  31,  15,  18,  38,  35,
         26,  63,  32,  41,  68,  74,  49,  68,  78,  97,  89, 115,  79, 136,  92, 135, 126, 134,
        117, 172, 136, 164, 124, 138, 145, 137,  95, 146, 116, 132, 114,  96,  88,  87,  82,  93,
         79,  86,  49,  79,  4

In [64]:
# pH level of a wine ranges from 3 to 4.
# normal_ph_range is denoted by 1.
# acidic than normal is denoted by 0.

normal_ph_range = np.arange(3.00, 4.01, 0.01)

In [65]:
normal_ph_range

array([3.  , 3.01, 3.02, 3.03, 3.04, 3.05, 3.06, 3.07, 3.08, 3.09, 3.1 , 3.11, 3.12, 3.13, 3.14,
       3.15, 3.16, 3.17, 3.18, 3.19, 3.2 , 3.21, 3.22, 3.23, 3.24, 3.25, 3.26, 3.27, 3.28, 3.29,
       3.3 , 3.31, 3.32, 3.33, 3.34, 3.35, 3.36, 3.37, 3.38, 3.39, 3.4 , 3.41, 3.42, 3.43, 3.44,
       3.45, 3.46, 3.47, 3.48, 3.49, 3.5 , 3.51, 3.52, 3.53, 3.54, 3.55, 3.56, 3.57, 3.58, 3.59,
       3.6 , 3.61, 3.62, 3.63, 3.64, 3.65, 3.66, 3.67, 3.68, 3.69, 3.7 , 3.71, 3.72, 3.73, 3.74,
       3.75, 3.76, 3.77, 3.78, 3.79, 3.8 , 3.81, 3.82, 3.83, 3.84, 3.85, 3.86, 3.87, 3.88, 3.89,
       3.9 , 3.91, 3.92, 3.93, 3.94, 3.95, 3.96, 3.97, 3.98, 3.99, 4.  ])

In [66]:
raw_white_wine[:,8] = np.where(np.isin(raw_white_wine[:,8], normal_ph_range), 1, 0)

In [67]:
np.unique(raw_white_wine[:,8], return_counts = True)

(array([0., 1.]), array([4775,  123]))

In [68]:
header_white, header_white[9]

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 '"sulphates"')

In [69]:
np.unique(raw_white_wine[:,9], return_counts = True)

(array([0.22, 0.23, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37,
        0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52,
        0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67,
        0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82,
        0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9 , 0.92, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99,
        1.  , 1.01, 1.06, 1.08]),
 array([  1,   1,   4,   4,  13,  13,  16,  31,  35,  54,  59,  84,  85, 120, 129, 214, 151, 168,
        139, 181, 161, 216, 178, 225, 172, 179, 166, 249, 140, 156, 135, 167, 102, 108,  83,  99,
         97,  88,  45,  68,  48,  67,  28,  36,  35,  44,  30,  27,  18,  33,  12,  19,  22,  19,
         16,  19,  16,   5,   5,  13,   2,   4,   3,   2,   2,   7,   1,   5,   2,   2,   5,   3,
          1,   6,   1,   1,   1,   1,   1]))

In [70]:
# Sulphates are a food preservative common in winemaking, since it maintain wine's flavor and freshness.
# They’re assoc. w/ long list of side effects like the dreaded wine-induced headache. 
# white wine typically can only contain up to 0.2 g/L of sulphites.
# exceeding 0.2 g/L -- denoted by 0.
# normal range-- denoted by 1.

normal_range = np.arange(0.00, 0.20, 0.01)

In [71]:
normal_range

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.12, 0.13, 0.14,
       0.15, 0.16, 0.17, 0.18, 0.19])

In [72]:
raw_white_wine[:,9] = np.where(np.isin(raw_white_wine[:,9], normal_range), 1, 0)

In [73]:
np.unique(raw_white_wine[:,9], return_counts = True)

(array([0.]), array([4898]))

In [74]:
header_white, header_white[10]

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 '"alcohol"')

In [75]:
np.unique(raw_white_wine[:,10], return_counts = True)

(array([ 8.        ,  8.4       ,  8.5       ,  8.6       ,  8.7       ,  8.8       ,  8.9       ,
         9.        ,  9.1       ,  9.2       ,  9.3       ,  9.4       ,  9.5       ,  9.53333333,
         9.55      ,  9.6       ,  9.63333333,  9.7       ,  9.73333333,  9.75      ,  9.8       ,
         9.9       , 10.        , 10.03333333, 10.1       , 10.13333333, 10.15      , 10.2       ,
        10.3       , 10.4       , 10.46666667, 10.5       , 10.53333333, 10.55      , 10.56666667,
        10.6       , 10.65      , 10.7       , 10.8       , 10.9       , 10.93333333, 10.96666667,
        10.98      , 11.        , 11.05      , 11.06666667, 11.1       , 11.2       , 11.26666667,
        11.3       , 11.33333333, 11.35      , 11.36666667, 11.4       , 11.43333333, 11.45      ,
        11.46666667, 11.5       , 11.55      , 11.6       , 11.63333333, 11.65      , 11.7       ,
        11.73333333, 11.75      , 11.8       , 11.85      , 11.9       , 11.94      , 11.95      ,
        12

In [76]:
# since data (above) has values with many decimal points, 
# rounding for the reliability of command and analaysis used. 

raw_white_wine[:,10] = np.round(raw_white_wine[:,10], decimals = 2)

In [77]:
np.unique(raw_white_wine[:,10], return_counts = True)

(array([ 8.  ,  8.4 ,  8.5 ,  8.6 ,  8.7 ,  8.8 ,  8.9 ,  9.  ,  9.1 ,  9.2 ,  9.3 ,  9.4 ,  9.5 ,
         9.53,  9.55,  9.6 ,  9.63,  9.7 ,  9.73,  9.75,  9.8 ,  9.9 , 10.  , 10.03, 10.1 , 10.13,
        10.15, 10.2 , 10.3 , 10.4 , 10.47, 10.5 , 10.53, 10.55, 10.57, 10.6 , 10.65, 10.7 , 10.8 ,
        10.9 , 10.93, 10.97, 10.98, 11.  , 11.05, 11.07, 11.1 , 11.2 , 11.27, 11.3 , 11.33, 11.35,
        11.37, 11.4 , 11.43, 11.45, 11.47, 11.5 , 11.55, 11.6 , 11.63, 11.65, 11.7 , 11.73, 11.75,
        11.8 , 11.85, 11.9 , 11.94, 11.95, 12.  , 12.05, 12.07, 12.1 , 12.15, 12.2 , 12.25, 12.3 ,
        12.33, 12.4 , 12.5 , 12.6 , 12.7 , 12.75, 12.8 , 12.89, 12.9 , 13.  , 13.05, 13.1 , 13.13,
        13.2 , 13.3 , 13.4 , 13.5 , 13.55, 13.6 , 13.7 , 13.8 , 13.9 , 14.  , 14.05, 14.2 ]),
 array([  2,   3,   9,  23,  78, 107,  95, 185, 144, 199, 134, 229, 228,   3,   2, 128,   1, 105,
          2,   1, 136, 109, 162,   1, 114,   2,   3, 130,  85, 153,   2, 160,   1,   2,   1, 114,
          1,  96,

In [78]:
# Wine can have anywhere between 5% and 23% Alcohol by Volume (ABV). 
# The avg alcohol content of wine is about 12%. 
# less than 11%-- light group which is denoted by 0.
# around 12% -- medium group denoted by 1.
# higher than 12% -- heavy group denoted by 2.

light_group = np.array([8.  ,  8.4 ,  8.5 ,  8.6 ,  8.7 ,  8.8 ,  8.9 ,  
                        9.  ,  9.1 ,  9.2 ,  9.3 ,  9.4 ,  9.5 , 9.53,  9.55,  9.6 ,  9.63,  
                        9.7 ,  9.73,  9.75,  9.8 ,  9.9 , 10.  , 10.03, 10.1 , 10.13, 10.15, 
                        10.2 , 10.3 , 10.4 , 10.47, 10.5 , 10.53, 10.55, 10.57, 10.6 , 10.65, 
                        10.7 , 10.8 , 10.9 , 10.93, 10.97, 10.98, 11.  , 11.05, 11.07, 
                        11.1 , 11.2 , 11.27, 11.3 , 11.33, 11.35, 11.37, 11.4 , 11.43, 11.45, 11.47, 
                        11.5 , 11.55, 11.6 , 11.63, 11.65,])
medium_group = np.array([11.7 , 11.73, 11.75, 11.8 , 11.85, 11.9 , 11.94, 11.95, 
                         12.  , 12.05, 12.07, 12.1 , 12.15, 12.2 , 12.25, 12.3 , 12.33])
heavy_group = np.array([12.4 , 12.5 , 12.6 , 12.7 , 12.75, 12.8 , 12.89, 12.9 , 
                        13.  , 13.05, 13.1 , 13.13, 13.2 , 13.3 , 13.4 , 13.5 , 13.55, 13.6 , 
                        13.7 , 13.8 , 13.9 , 14.  , 14.05, 14.2 ])

In [79]:
raw_white_wine[:,10] = np.where(np.isin(raw_white_wine[:,10], light_group), 0, raw_white_wine[:,10])
raw_white_wine[:,10] = np.where(np.isin(raw_white_wine[:,10], medium_group), 1, raw_white_wine[:,10])
raw_white_wine[:,10] = np.where(np.isin(raw_white_wine[:,10], heavy_group), 2, raw_white_wine[:,10])

In [80]:
np.unique(raw_white_wine[:,10], return_counts = True)

(array([0., 1., 2.]), array([3907,  485,  506]))

In [81]:
len(raw_white_wine[:,10])

4898

In [82]:
header_white, header_white[11]

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 '"quality"')

In [83]:
np.unique(raw_white_wine[:,11])

array([3., 4., 5., 6., 7., 8., 9.])

In [84]:
# The quality of the wines is a score between 1 and 10.
# low quality(0,1,2,3)-- denoted by 0.
# avg. quality(4,5,6,7)-- denoted by 1.
# high quality(8,9,10)-- denoted by 2.

low_quality = np.array([0,1,2,3])
avg_quality = np.array([4,5,6,7])
high_quality = np.array([8,9,10])

In [85]:
raw_white_wine[:,11] = np.where(np.isin(raw_white_wine[:,11], low_quality), 0, raw_white_wine[:,11])

In [86]:
raw_white_wine[:,11] = np.where(np.isin(raw_white_wine[:,11], avg_quality), 1, raw_white_wine[:,11])
raw_white_wine[:,11] = np.where(np.isin(raw_white_wine[:,11], high_quality), 2, raw_white_wine[:,11])

In [87]:
np.unique(raw_white_wine[:,11])

array([0., 1., 2.])

In [88]:
len(raw_white_wine[:,11])

4898

In [89]:
WhiteWine_checkpoint_II = checkpoint("WhiteWine-checkpoint-II", header_white, raw_white_wine)

In [90]:
WhiteWine_checkpoint_II['data']

array([[2., 0., 1., ..., 0., 0., 1.],
       [2., 0., 1., ..., 0., 0., 1.],
       [3., 0., 1., ..., 0., 0., 1.],
       ...,
       [2., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 2., 1.],
       [1., 0., 1., ..., 0., 1., 1.]])

In [91]:
raw_white_wine

array([[2., 0., 1., ..., 0., 0., 1.],
       [2., 0., 1., ..., 0., 0., 1.],
       [3., 0., 1., ..., 0., 0., 1.],
       ...,
       [2., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 2., 1.],
       [1., 0., 1., ..., 0., 1., 1.]])

In [92]:
white_wine_w_headers = np.vstack((header_white, raw_white_wine))
white_wine_w_headers

array([['"fixed acidity"', '"volatile acidity"', '"citric acid"', ..., '"sulphates"', '"alcohol"',
        '"quality"'],
       ['2.0', '0.0', '1.0', ..., '0.0', '0.0', '1.0'],
       ['2.0', '0.0', '1.0', ..., '0.0', '0.0', '1.0'],
       ...,
       ['2.0', '0.0', '1.0', ..., '0.0', '0.0', '1.0'],
       ['1.0', '0.0', '1.0', ..., '0.0', '2.0', '1.0'],
       ['1.0', '0.0', '1.0', ..., '0.0', '1.0', '1.0']], dtype='<U32')

In [93]:
grade_A = np.array(['2.0'])
grade_B = np.array(['1.0'])
grade_C = np.array(['0.0'])

In [94]:
white_wine_w_headers[:,11] = np.where(np.isin(white_wine_w_headers[:,11], grade_A), 'A', white_wine_w_headers[:,11])

In [95]:
white_wine_w_headers[:,11] = np.where(np.isin(white_wine_w_headers[:,11], grade_B), 'B', white_wine_w_headers[:,11])

In [96]:
white_wine_w_headers[:,11] = np.where(np.isin(white_wine_w_headers[:,11], grade_C), 'C', white_wine_w_headers[:,11])

In [97]:
np.unique(white_wine_w_headers[:,11])

array(['"quality"', 'A', 'B', 'C'], dtype='<U32')

In [98]:
np.savetxt("White-Wine-PreProcessed-wGrades.csv", white_wine_w_headers, fmt = '%s', delimiter = ',')

In [99]:
import pandas as pd

In [100]:
white_wine_df = pd.read_csv("White-Wine-PreProcessed-wGrades.csv", delimiter = ',')
white_wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,B
1,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
2,3.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,B
3,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
4,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
4894,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
4895,2.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,B
4896,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,B


In [101]:
# since there is no wine sample exceeding density threshold, 
# 7th column no longer carries valuable information for further analysis.

white_wine_df = white_wine_df.drop(axis = 1 , columns = 'density')
white_wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality
0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,B
1,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,B
2,3.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
3,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,B
4,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...,...
4893,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,B
4894,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,B
4895,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
4896,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,B


In [102]:
# since there is no wine sample denoted by 1.0 in sulphates category, 
# 8th column no longer carries valuable information for further analysis.

white_wine_df = white_wine_df.drop(axis =1 , columns = 'sulphates')
white_wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,alcohol,quality
0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,B
1,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
2,3.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,B
3,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...
4893,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4894,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4895,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,B
4896,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,B


In [103]:
header_white

array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
       '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
       '"alcohol"', '"quality"'], dtype='<U22')

In [104]:
white_wine_df = white_wine_df.rename(columns = {"fixed acidity": "Fixed Acidity", 
                                                "volatile acidity": "Volatile Acidity",
                                                "citric acid": "Citric Acid", "residual sugar": "Residual Sugar",
                                                "chlorides": "Chlorides", "free sulfur dioxide": "Free SO2",
                                                "total sulfur dioxide": "Total SO2", "pH": "pH", 
                                                "alcohol": "Alcohol", "quality": "Quality Grade"})

In [105]:
white_wine_df

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol,Quality Grade
0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,B
1,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
2,3.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,B
3,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...
4893,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4894,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4895,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,B
4896,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,B


In [106]:
white_wine_df.to_csv("White-Wine-Preprocessed-V2.csv", sep = ',')

In [107]:
white_wine_df2 = pd.read_csv("White-Wine-Preprocessed-V2.csv", delimiter = ',', index_col = 0)
white_wine_df2

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol,Quality Grade
0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,B
1,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
2,3.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,B
3,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...
4893,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4894,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
4895,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,B
4896,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,B


In [108]:
white_wine_sorted = white_wine_df2.sort_values('Quality Grade')

In [109]:
white_wine_sorted

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol,Quality Grade
3664,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,A
4340,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,A
3029,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,A
2774,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,A
1412,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,A
...,...,...,...,...,...,...,...,...,...,...
1931,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,C
3810,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,C
2373,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,C
873,4.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,C


In [110]:
white_wine_sorted.describe()

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,1.876684,0.164965,0.990608,0.179257,0.145365,0.168436,0.999592,0.025112,0.305635
std,0.570444,0.448879,0.096464,0.383606,0.352505,0.374291,0.020205,0.156482,0.647242
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,4.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [111]:
# seperating main table into smaller tables based on quality grouping

quality_A = white_wine_sorted[white_wine_sorted['Quality Grade'] == 'A']
quality_A

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol,Quality Grade
3664,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,A
4340,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,A
3029,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,A
2774,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,A
1412,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,A
...,...,...,...,...,...,...,...,...,...,...
1358,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,A
1779,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,A
2804,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,A
672,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,A


In [112]:
quality_B = white_wine_sorted[white_wine_sorted['Quality Grade'] == 'B']
quality_B

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol,Quality Grade
3347,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
3278,2.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,B
3335,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,B
3280,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
3334,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...
1678,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,B
1677,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B
1676,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,B
1675,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B


In [113]:
quality_C = white_wine_sorted[white_wine_sorted['Quality Grade'] == 'C']
quality_C

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol,Quality Grade
4745,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,C
1417,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,C
3307,3.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,C
3087,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,C
740,2.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0,C
253,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,C
1484,2.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,C
1034,2.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,C
2050,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,C
251,3.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,C


In [114]:
quality_A.describe()

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol
count,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0
mean,1.794444,0.188889,0.988889,0.15,0.111111,0.227778,1.0,0.011111,0.977778
std,0.535836,0.481952,0.105114,0.358067,0.315146,0.420568,0.0,0.105114,0.884096
min,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0
max,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [115]:
quality_B.describe()

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol
count,4698.0,4698.0,4698.0,4698.0,4698.0,4698.0,4698.0,4698.0,4698.0
mean,1.878459,0.162835,0.990634,0.179864,0.146871,0.166667,1.0,0.025756,0.280119
std,0.569167,0.445552,0.096332,0.384115,0.354015,0.372718,0.0,0.158422,0.622371
min,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,4.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [116]:
quality_C.describe()

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free SO2,Total SO2,pH,Alcohol
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,2.2,0.45,1.0,0.3,0.1,0.05,0.9,0.0,0.25
std,0.951453,0.759155,0.0,0.470162,0.307794,0.223607,0.307794,0.0,0.638666
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.75,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,3.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
max,4.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0


In [117]:
quality_A.to_csv("Grade-A-White-Wine-Preprocessed.csv", sep = ',', index = False)

In [118]:
quality_B.to_csv("Grade-B-White-Wine-Preprocessed.csv", sep = ',', index = False)

In [119]:
quality_C.to_csv("Grade-C-White-Wine-Preprocessed.csv", sep = ',', index = False)

In [120]:
raw_white_wine_2 = np.genfromtxt("winequality-white.csv", delimiter = ';', skip_header = 1)
raw_white_wine_2

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]])

In [121]:
# The quality of the wines is a score between 1 and 10.
# low quality(0,1,2,3)-- denoted by 0.
# avg. quality(4,5,6,7)-- denoted by 1.
# high quality(8,9,10)-- denoted by 2.

low_quality = np.array([0,1,2,3])
avg_quality = np.array([4,5,6,7])
high_quality = np.array([8,9,10])

In [122]:
raw_white_wine_2[:,-1] = np.where(np.isin(raw_white_wine_2[:,-1], low_quality), 0, raw_white_wine_2[:,-1])

In [123]:
raw_white_wine_2[:,-1] = np.where(np.isin(raw_white_wine_2[:,-1], avg_quality), 1, raw_white_wine_2[:,-1])
raw_white_wine_2[:,-1] = np.where(np.isin(raw_white_wine_2[:,-1], high_quality), 2, raw_white_wine_2[:,-1])

In [124]:
np.unique(raw_white_wine_2[:,-1])

array([0., 1., 2.])

In [125]:
raw_white_wine_2_w_headers = np.vstack((header_white, raw_white_wine_2))
raw_white_wine_2_w_headers

array([['"fixed acidity"', '"volatile acidity"', '"citric acid"', ..., '"sulphates"', '"alcohol"',
        '"quality"'],
       ['7.0', '0.27', '0.36', ..., '0.45', '8.8', '1.0'],
       ['6.3', '0.3', '0.34', ..., '0.49', '9.5', '1.0'],
       ...,
       ['6.5', '0.24', '0.19', ..., '0.46', '9.4', '1.0'],
       ['5.5', '0.29', '0.3', ..., '0.38', '12.8', '1.0'],
       ['6.0', '0.21', '0.38', ..., '0.32', '11.8', '1.0']], dtype='<U32')

In [126]:
grade_A = np.array(['2.0'])
grade_B = np.array(['1.0'])
grade_C = np.array(['0.0'])

In [127]:
raw_white_wine_2_w_headers[:,-1] = np.where(np.isin(raw_white_wine_2_w_headers[:,-1], grade_A), 
                                            'A', raw_white_wine_2_w_headers[:,-1])

In [128]:
raw_white_wine_2_w_headers[:,-1] = np.where(np.isin(raw_white_wine_2_w_headers[:,-1], grade_B), 
                                            'B', raw_white_wine_2_w_headers[:,-1])

In [129]:
raw_white_wine_2_w_headers[:,-1] = np.where(np.isin(raw_white_wine_2_w_headers[:,-1], grade_C), 
                                            'C', raw_white_wine_2_w_headers[:,-1])

In [130]:
np.unique(raw_white_wine_2_w_headers[:,-1])

array(['"quality"', 'A', 'B', 'C'], dtype='<U32')

In [131]:
np.savetxt("Raw-White-Wine-w-Grades.csv", raw_white_wine_2_w_headers, fmt = '%s', delimiter = ',')