In [1]:
import numpy as np
import pandas as pd

In [2]:
np.set_printoptions(suppress = True, linewidth = 100)

In [3]:
raw_redwine = np.genfromtxt("winequality-red.csv", delimiter = ';', skip_header = 1)
raw_redwine

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

In [4]:
raw_redwine.shape

(1599, 12)

In [None]:
# fixed acidity (g(tartaric acid)/L), volatile acidity (g(acetic acid)/L), citric acid(g/L)
# residual sugar (g/L), chlorides(g(sodium chloride)/L), free sulfur dioxide (mg/L), total sulfur dioxide (mg/L)
# density (g/ml), pH , sulphates(g(potassium sulphate)/L)
# alcohol ( % vol.).

In [5]:
header_red = np.genfromtxt("winequality-red.csv", delimiter = ';', skip_footer = raw_redwine.shape[0], autostrip = True, dtype = 'str')
header_red

array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
       '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
       '"alcohol"', '"quality"'], dtype='<U22')

In [6]:
def checkpoint(file_name, checkpoint_header, checkpoint_data):
    np.savez(file_name, header = checkpoint_header, data = checkpoint_data)
    checkpoint_variable = np.load(file_name + ".npz")
    return(checkpoint_variable)

In [7]:
RedWine_checkpoint_1 = checkpoint("RedWine-1st-checkpoint", header_red, raw_redwine)

In [8]:
RedWine_checkpoint_1['header'],RedWine_checkpoint_1['data']

(array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
        '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
        '"alcohol"', '"quality"'], dtype='<U22'),
 array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
        [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
        [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
        ...,
        [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
        [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
        [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]]))

In [9]:
# screeening for missing values

np.isnan(raw_redwine).sum()

0

In [10]:
stats_redwine = np.array([np.min(raw_redwine[:,0]), np.max(raw_redwine[:,0])])
stats_redwine

array([ 4.6, 15.9])

In [11]:
np.unique(raw_redwine[:,0])

array([ 4.6,  4.7,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,  5.6,  5.7,  5.8,  5.9,  6. ,  6.1,
        6.2,  6.3,  6.4,  6.5,  6.6,  6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,
        7.7,  7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,  8.9,  9. ,  9.1,
        9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9, 10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6,
       10.7, 10.8, 10.9, 11. , 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12. , 12.1,
       12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13. , 13.2, 13.3, 13.4, 13.5, 13.7, 13.8,
       14. , 14.3, 15. , 15.5, 15.6, 15.9])

In [12]:
# grouping fixed acidity into 4 stages: 
# 1-- low value of fixed acidity(4.5-6.9)
# 2-- notr value of fixed acidity(7.0-7.9)
# 3-- high value of fixed acidity(8.0-11.9)
# 4-- very high value of fixed acidity(12.0-15.9)

low_value = np.array([ 4.6,  4.7,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,  5.6,  5.7,  5.8,  5.9,  
                      6. ,  6.1, 6.2,  6.3,  6.4,  6.5,  6.6,  6.7,  6.8,  6.9])
notr_value = np.array([7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6, 7.7,  7.8,  7.9])
high_value = np.array([8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,  8.9,  
                       9. ,  9.1, 9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9, 
                       10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 
                       11. , 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9])
very_high_value = np.array([12. , 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 
                            13. , 13.2, 13.3, 13.4, 13.5, 13.7, 13.8, 14. , 14.3, 15. , 15.5, 15.6, 15.9])

In [13]:
raw_redwine[:,0] = np.where(np.isin(raw_redwine[:,0], low_value), 1, raw_redwine[:,0])
raw_redwine[:,0] = np.where(np.isin(raw_redwine[:,0], notr_value), 2, raw_redwine[:,0])
raw_redwine[:,0] = np.where(np.isin(raw_redwine[:,0], high_value), 3, raw_redwine[:,0])
raw_redwine[:,0] = np.where(np.isin(raw_redwine[:,0], very_high_value), 4, raw_redwine[:,0])

In [14]:
np.unique(raw_redwine[:,0])

array([1., 2., 3., 4.])

In [15]:
len(raw_redwine[:,0])

1599

In [16]:
header_red

array(['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"',
       '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"',
       '"alcohol"', '"quality"'], dtype='<U22')

In [17]:
np.unique(raw_redwine[:,1])

array([0.12 , 0.16 , 0.18 , 0.19 , 0.2  , 0.21 , 0.22 , 0.23 , 0.24 , 0.25 , 0.26 , 0.27 , 0.28 ,
       0.29 , 0.295, 0.3  , 0.305, 0.31 , 0.315, 0.32 , 0.33 , 0.34 , 0.35 , 0.36 , 0.365, 0.37 ,
       0.38 , 0.39 , 0.395, 0.4  , 0.41 , 0.415, 0.42 , 0.43 , 0.44 , 0.45 , 0.46 , 0.47 , 0.475,
       0.48 , 0.49 , 0.5  , 0.51 , 0.52 , 0.53 , 0.54 , 0.545, 0.55 , 0.56 , 0.565, 0.57 , 0.575,
       0.58 , 0.585, 0.59 , 0.595, 0.6  , 0.605, 0.61 , 0.615, 0.62 , 0.625, 0.63 , 0.635, 0.64 ,
       0.645, 0.65 , 0.655, 0.66 , 0.665, 0.67 , 0.675, 0.68 , 0.685, 0.69 , 0.695, 0.7  , 0.705,
       0.71 , 0.715, 0.72 , 0.725, 0.73 , 0.735, 0.74 , 0.745, 0.75 , 0.755, 0.76 , 0.765, 0.77 ,
       0.775, 0.78 , 0.785, 0.79 , 0.795, 0.8  , 0.805, 0.81 , 0.815, 0.82 , 0.825, 0.83 , 0.835,
       0.84 , 0.845, 0.85 , 0.855, 0.86 , 0.865, 0.87 , 0.875, 0.88 , 0.885, 0.89 , 0.895, 0.9  ,
       0.91 , 0.915, 0.92 , 0.935, 0.95 , 0.955, 0.96 , 0.965, 0.975, 0.98 , 1.   , 1.005, 1.01 ,
       1.02 , 1.025,

In [18]:
# The average volatile acidity value for red table wines during this period is about 0.60 g/L.
# data for volatile acidity of red wine will be calssified into 3 classes: 0-- below average(0.100-0.530)
# 1-- average (0.540- 0.690)
# 2-- above average (0.695-1.580)

below_average = np.array([0.12 , 0.16 , 0.18 , 0.19 , 0.2  , 0.21 , 0.22 , 0.23 , 0.24 , 
                          0.25 , 0.26 , 0.27 , 0.28 , 0.29 , 0.295, 0.3  , 0.305, 0.31 , 0.315, 
                          0.32 , 0.33 , 0.34 , 0.35 , 0.36 , 0.365, 0.37 , 0.38 , 0.39 , 0.395, 
                          0.4  , 0.41 , 0.415, 0.42 , 0.43 , 0.44 , 0.45 , 0.46 , 0.47 , 0.475, 0.48 , 0.49 , 
                          0.5  , 0.51 , 0.52 , 0.53])
average = np.array([0.54 , 0.545, 0.55 , 0.56 , 0.565, 0.57 , 0.575, 0.58 , 0.585, 0.59 , 0.595, 
                    0.6  , 0.605, 0.61 , 0.615, 0.62 , 0.625, 0.63 , 0.635, 0.64 , 0.645, 
                    0.65 , 0.655, 0.66 , 0.665, 0.67 , 0.675, 0.68 , 0.685, 0.69])
above_average = np.array([0.695, 0.7  , 0.705, 0.71 , 0.715, 0.72 , 0.725, 0.73 , 0.735, 
                          0.74 , 0.745, 0.75 , 0.755, 0.76 , 0.765, 0.77 , 0.775, 0.78 , 0.785, 
                          0.79 , 0.795, 0.8  , 0.805, 0.81 , 0.815, 0.82 , 0.825, 0.83 , 0.835, 
                          0.84 , 0.845, 0.85 , 0.855, 0.86 , 0.865, 0.87 , 0.875, 0.88 , 0.885, 
                          0.89 , 0.895, 0.9  , 0.91 , 0.915, 0.92 , 0.935, 0.95 , 0.955, 
                          0.96 , 0.965, 0.975, 0.98 , 1.   , 1.005, 1.01 , 1.02 , 1.025, 1.035, 
                          1.04 , 1.07 , 1.09 , 1.115, 1.13 , 1.18 , 1.185, 1.24 , 1.33 , 1.58])

In [19]:
raw_redwine[:,1] = np.where(np.isin(raw_redwine[:,1], below_average), 0, raw_redwine[:,1])

In [20]:
raw_redwine[:,1] = np.where(np.isin(raw_redwine[:,1], above_average), 2, raw_redwine[:,1])

In [21]:
raw_redwine[:,1] = np.where(np.isin(raw_redwine[:,1], average), 1, raw_redwine[:,1])

In [22]:
np.unique(raw_redwine[:,1])

array([0., 1., 2.])

In [23]:
raw_redwine[:,1]

array([2., 2., 2., ..., 0., 1., 0.])

In [24]:
header_red[2]

'"citric acid"'

In [25]:
np.unique(raw_redwine[:,2], return_counts = True)

(array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.12, 0.13, 0.14,
        0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29,
        0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
        0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59,
        0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74,
        0.75, 0.76, 0.78, 0.79, 1.  ]),
 array([132,  33,  50,  30,  29,  20,  24,  22,  33,  30,  35,  15,  27,  18,  21,  19,   9,  16,
         22,  21,  25,  33,  27,  25,  51,  27,  38,  20,  19,  21,  30,  30,  32,  25,  24,  13,
         20,  19,  14,  28,  29,  16,  29,  15,  23,  22,  19,  18,  23,  68,  20,  13,  17,  14,
         13,  12,   8,   9,   9,   8,   9,   2,   1,  10,   9,   7,  14,   2,  11,   4,   2,   1,
          1,   3,   4,   1,   3,   1,   1,   1]))

In [26]:
# It can be added to finished wines to increase acidity and give a “fresh" flavor. 
# The disadvantage of adding citric acid is its microbial instability. 
# we will classify citric acid(cc) data column according to its existence: 0-- will be denoted to not added
# 1-- citric acid addded
#!! for non exist- the range is applied since 0.04 can be just noise due to instrumental measurment

cc_not_exist = np.array([0.  , 0.01, 0.02, 0.03])
cc_exist = np.array([0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 
                     0.1 , 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 
                     0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 
                     0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 
                     0.4 , 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 
                     0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 
                     0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 
                     0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.78, 0.79, 1.])

In [27]:
raw_redwine[:,2] = np.where(np.isin(raw_redwine[:,2], cc_not_exist), 0, raw_redwine[:,2])
raw_redwine[:,2] = np.where(np.isin(raw_redwine[:,2], cc_exist), 1, raw_redwine[:,2])

In [28]:
np.unique(raw_redwine[:,2], return_counts = True)

(array([0., 1.]), array([ 245, 1354]))

In [29]:
header_red[3]

'"residual sugar"'

In [30]:
raw_redwine[:,3]

array([1.9, 2.6, 2.3, ..., 2.3, 2. , 3.6])

In [31]:
np.unique(raw_redwine[:,3], return_counts = True)

(array([ 0.9 ,  1.2 ,  1.3 ,  1.4 ,  1.5 ,  1.6 ,  1.65,  1.7 ,  1.75,  1.8 ,  1.9 ,  2.  ,  2.05,
         2.1 ,  2.15,  2.2 ,  2.25,  2.3 ,  2.35,  2.4 ,  2.5 ,  2.55,  2.6 ,  2.65,  2.7 ,  2.8 ,
         2.85,  2.9 ,  2.95,  3.  ,  3.1 ,  3.2 ,  3.3 ,  3.4 ,  3.45,  3.5 ,  3.6 ,  3.65,  3.7 ,
         3.75,  3.8 ,  3.9 ,  4.  ,  4.1 ,  4.2 ,  4.25,  4.3 ,  4.4 ,  4.5 ,  4.6 ,  4.65,  4.7 ,
         4.8 ,  5.  ,  5.1 ,  5.15,  5.2 ,  5.4 ,  5.5 ,  5.6 ,  5.7 ,  5.8 ,  5.9 ,  6.  ,  6.1 ,
         6.2 ,  6.3 ,  6.4 ,  6.55,  6.6 ,  6.7 ,  7.  ,  7.2 ,  7.3 ,  7.5 ,  7.8 ,  7.9 ,  8.1 ,
         8.3 ,  8.6 ,  8.8 ,  8.9 ,  9.  , 10.7 , 11.  , 12.9 , 13.4 , 13.8 , 13.9 , 15.4 , 15.5 ]),
 array([  2,   8,   5,  35,  30,  58,   2,  76,   2, 129, 117, 156,   2, 128,   2, 131,   1, 109,
          1,  86,  84,   1,  79,   1,  39,  49,   1,  24,   1,  25,   7,  15,  11,  15,   1,   2,
          8,   1,   4,   1,   8,   6,  11,   6,   5,   1,   8,   4,   4,   6,   2,   1,   3,   1,
          5

In [32]:
# Residual sugar levels vary in different styles of wine. 
# ‘Dry’ wines: 0-4 g/L-- denoted by 0 
# ‘Sweet’ wines: 35 g/L and then go up from there -- denoted by 1

sweet_wine = np.array(range(35,100))

In [33]:
sweet_wine

array([35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
       58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [34]:
raw_redwine[:,3] = np.where(np.isin(raw_redwine[:,3], sweet_wine), 1, 0)

In [35]:
np.unique(raw_redwine[:,3], return_counts = True)

(array([0.]), array([1599]))

In [36]:
header_red[4]

'"chlorides"'

In [37]:
np.unique(raw_redwine[:,4], return_counts = True)

(array([0.012, 0.034, 0.038, 0.039, 0.041, 0.042, 0.043, 0.044, 0.045, 0.046, 0.047, 0.048, 0.049,
        0.05 , 0.051, 0.052, 0.053, 0.054, 0.055, 0.056, 0.057, 0.058, 0.059, 0.06 , 0.061, 0.062,
        0.063, 0.064, 0.065, 0.066, 0.067, 0.068, 0.069, 0.07 , 0.071, 0.072, 0.073, 0.074, 0.075,
        0.076, 0.077, 0.078, 0.079, 0.08 , 0.081, 0.082, 0.083, 0.084, 0.085, 0.086, 0.087, 0.088,
        0.089, 0.09 , 0.091, 0.092, 0.093, 0.094, 0.095, 0.096, 0.097, 0.098, 0.099, 0.1  , 0.101,
        0.102, 0.103, 0.104, 0.105, 0.106, 0.107, 0.108, 0.109, 0.11 , 0.111, 0.112, 0.113, 0.114,
        0.115, 0.116, 0.117, 0.118, 0.119, 0.12 , 0.121, 0.122, 0.123, 0.124, 0.125, 0.126, 0.127,
        0.128, 0.132, 0.136, 0.137, 0.143, 0.145, 0.146, 0.147, 0.148, 0.152, 0.153, 0.157, 0.159,
        0.161, 0.165, 0.166, 0.168, 0.169, 0.17 , 0.171, 0.172, 0.174, 0.176, 0.178, 0.186, 0.19 ,
        0.194, 0.2  , 0.205, 0.213, 0.214, 0.216, 0.222, 0.226, 0.23 , 0.235, 0.236, 0.241, 0.243,
        0.

In [38]:
# Chlorides (sodium chloride) give the wine a salty flavor which may turn away potential consumers. 
# The chlorides max conc. in wine is about 0.20 - 0.60 g/L.
# Not salty (below_maximum) -- denoted by 0
# Salty (max. conc.) -- denoted by 1

salty = np.array([0.2  , 0.205, 0.213, 0.214, 0.216, 0.222, 0.226, 
                  0.23 , 0.235, 0.236, 0.241, 0.243, 0.25 , 0.263, 0.267, 
                  0.27 , 0.332, 0.337, 0.341, 0.343, 0.358, 0.36 , 0.368, 0.369, 
                  0.387, 0.401, 0.403, 0.413, 0.414, 0.415, 0.422, 0.464, 0.467, 0.61 , 0.611])

In [39]:
raw_redwine[:,4] = np.where(np.isin(raw_redwine[:,4], salty), 1, 0)

In [40]:
np.unique(raw_redwine[:,4], return_counts = True)

(array([0., 1.]), array([1557,   42]))

In [41]:
header_red[5]

'"free sulfur dioxide"'

In [42]:
np.unique(raw_redwine[:,5], return_counts = True)

(array([ 1. ,  2. ,  3. ,  4. ,  5. ,  5.5,  6. ,  7. ,  8. ,  9. , 10. , 11. , 12. , 13. , 14. ,
        15. , 16. , 17. , 18. , 19. , 20. , 21. , 22. , 23. , 24. , 25. , 26. , 27. , 28. , 29. ,
        30. , 31. , 32. , 33. , 34. , 35. , 36. , 37. , 37.5, 38. , 39. , 40. , 40.5, 41. , 42. ,
        43. , 45. , 46. , 47. , 48. , 50. , 51. , 52. , 53. , 54. , 55. , 57. , 66. , 68. , 72. ]),
 array([  3,   1,  49,  41, 104,   1, 138,  71,  56,  62,  79,  59,  75,  57,  50,  78,  61,  60,
         46,  39,  30,  41,  22,  32,  34,  24,  32,  29,  23,  23,  16,  20,  22,  11,  18,  15,
         11,   3,   2,   9,   5,   6,   1,   7,   3,   3,   3,   1,   1,   4,   2,   4,   3,   1,
          1,   2,   1,   1,   2,   1]))

In [43]:
# SO2 and its sulfite salts are essential for anti-oxidation and preservation properties. 
# SO2 Total = SO2 free + SO2 reacted.
# free SO2 values of 25 mg/L on red wine is recommended.
# not recommended denoted by 0 
# recommended denoted by 1-- (25-3, 25+3)

recommended = np.array([22. , 23. , 24. , 25. , 26. , 27. , 28.])

In [44]:
raw_redwine[:,5] = np.where(np.isin(raw_redwine[:,5], recommended), 1, 0)

In [45]:
np.unique(raw_redwine[:,5], return_counts = True)

(array([0., 1.]), array([1403,  196]))

In [46]:
header_red[6]

'"total sulfur dioxide"'

In [47]:
np.unique(raw_redwine[:,6], return_counts = True)

(array([  6. ,   7. ,   8. ,   9. ,  10. ,  11. ,  12. ,  13. ,  14. ,  15. ,  16. ,  17. ,  18. ,
         19. ,  20. ,  21. ,  22. ,  23. ,  24. ,  25. ,  26. ,  27. ,  28. ,  29. ,  30. ,  31. ,
         32. ,  33. ,  34. ,  35. ,  36. ,  37. ,  38. ,  39. ,  40. ,  41. ,  42. ,  43. ,  44. ,
         45. ,  46. ,  47. ,  48. ,  49. ,  50. ,  51. ,  52. ,  53. ,  54. ,  55. ,  56. ,  57. ,
         58. ,  59. ,  60. ,  61. ,  62. ,  63. ,  64. ,  65. ,  66. ,  67. ,  68. ,  69. ,  70. ,
         71. ,  72. ,  73. ,  74. ,  75. ,  76. ,  77. ,  77.5,  78. ,  79. ,  80. ,  81. ,  82. ,
         83. ,  84. ,  85. ,  86. ,  87. ,  88. ,  89. ,  90. ,  91. ,  92. ,  93. ,  94. ,  95. ,
         96. ,  98. ,  99. , 100. , 101. , 102. , 103. , 104. , 105. , 106. , 108. , 109. , 110. ,
        111. , 112. , 113. , 114. , 115. , 116. , 119. , 120. , 121. , 122. , 124. , 125. , 126. ,
        127. , 128. , 129. , 130. , 131. , 133. , 134. , 135. , 136. , 139. , 140. , 141. , 142. ,
        14

In [48]:
# EU allows red wine to have maximum total sulfur dioxide between 150 and 200 mg/L.
# not recommended 150-200 denoted by 0.

not_recommended_SO2 = np.array([151. , 152. , 153. , 155. , 160. , 165. , 278. , 289.])

In [49]:
raw_redwine[:,6] = np.where(np.isin(raw_redwine[:,6], not_recommended_SO2), 0, 1)

In [50]:
np.unique(raw_redwine[:,6], return_counts = True)

(array([0., 1.]), array([   9, 1590]))

In [51]:
header_red[7]

'"density"'

In [52]:
np.unique(raw_redwine[:,7], return_counts = True)

(array([0.99007, 0.9902 , 0.99064, 0.9908 , 0.99084, 0.9912 , 0.9915 , 0.99154, 0.99157, 0.9916 ,
        0.99162, 0.9917 , 0.99182, 0.99191, 0.9921 , 0.9922 , 0.99235, 0.99236, 0.9924 , 0.99242,
        0.99252, 0.99256, 0.99258, 0.99264, 0.9927 , 0.9928 , 0.99286, 0.9929 , 0.99292, 0.99294,
        0.99306, 0.99314, 0.99316, 0.99318, 0.9932 , 0.99322, 0.99323, 0.99328, 0.9933 , 0.99331,
        0.99332, 0.99334, 0.99336, 0.9934 , 0.99341, 0.99344, 0.99346, 0.99348, 0.9935 , 0.99352,
        0.99354, 0.99356, 0.99357, 0.99358, 0.9936 , 0.99362, 0.99364, 0.9937 , 0.99371, 0.99374,
        0.99376, 0.99378, 0.99379, 0.9938 , 0.99384, 0.99385, 0.99386, 0.99387, 0.99388, 0.99392,
        0.99394, 0.99395, 0.99396, 0.99397, 0.994  , 0.99402, 0.99408, 0.9941 , 0.99414, 0.99416,
        0.99417, 0.99418, 0.99419, 0.9942 , 0.99425, 0.99426, 0.99428, 0.9943 , 0.99434, 0.99437,
        0.99438, 0.99439, 0.9944 , 0.99444, 0.99448, 0.99451, 0.99454, 0.99456, 0.99458, 0.99459,
        0.9946 , 0.9

In [53]:
# Density is the mass per unit volume of wine. 
# After fermentation is complete, the wine should be at, or slightly less than 1.00.
# above threshold is denoted by 0.
# correct density range is denoted by 1.

above_threshold = np.arange(1.1, 2.1, 0.000001)

In [54]:
above_threshold

array([1.1     , 1.100001, 1.100002, ..., 2.099997, 2.099998, 2.099999])

In [55]:
raw_redwine[:,7] = np.where(np.isin(raw_redwine[:,7], above_threshold), 0, 1)

In [56]:
np.unique(raw_redwine[:,7], return_counts = True)

(array([1.]), array([1599]))

In [57]:
header_red[8]

'"pH"'

In [58]:
np.unique(raw_redwine[:,8], return_counts = True)

(array([2.74, 2.86, 2.87, 2.88, 2.89, 2.9 , 2.92, 2.93, 2.94, 2.95, 2.98, 2.99, 3.  , 3.01, 3.02,
        3.03, 3.04, 3.05, 3.06, 3.07, 3.08, 3.09, 3.1 , 3.11, 3.12, 3.13, 3.14, 3.15, 3.16, 3.17,
        3.18, 3.19, 3.2 , 3.21, 3.22, 3.23, 3.24, 3.25, 3.26, 3.27, 3.28, 3.29, 3.3 , 3.31, 3.32,
        3.33, 3.34, 3.35, 3.36, 3.37, 3.38, 3.39, 3.4 , 3.41, 3.42, 3.43, 3.44, 3.45, 3.46, 3.47,
        3.48, 3.49, 3.5 , 3.51, 3.52, 3.53, 3.54, 3.55, 3.56, 3.57, 3.58, 3.59, 3.6 , 3.61, 3.62,
        3.63, 3.66, 3.67, 3.68, 3.69, 3.7 , 3.71, 3.72, 3.74, 3.75, 3.78, 3.85, 3.9 , 4.01]),
 array([ 1,  1,  1,  2,  4,  1,  4,  3,  4,  1,  5,  2,  6,  5,  8,  6, 10,  8, 10, 11, 11, 11, 19,
         9, 20, 13, 21, 34, 36, 27, 30, 25, 39, 36, 39, 32, 29, 26, 53, 35, 42, 46, 57, 39, 45, 37,
        43, 39, 56, 37, 48, 48, 37, 34, 33, 17, 29, 20, 22, 21, 19, 10, 14, 15, 18, 17, 16,  8, 11,
        10, 10,  8,  7,  8,  4,  3,  4,  3,  5,  4,  1,  4,  3,  1,  1,  2,  1,  2,  2]))

In [59]:
# pH level of a wine ranges from 3 to 4.
# normal_ph_range is denoted by 1.
# acidic than normal is denoted by 0.

normal_ph_range = np.arange(3.00, 4.1, 0.01)

In [60]:
raw_redwine[:,8] = np.where(np.isin(raw_redwine[:,8], normal_ph_range), 1, 0)

In [61]:
np.unique(raw_redwine[:,8], return_counts = True)

(array([0., 1.]), array([1588,   11]))

In [62]:
header_red[9]

'"sulphates"'

In [63]:
np.unique(raw_redwine[:,9], return_counts = True)

(array([0.33, 0.37, 0.39, 0.4 , 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52,
        0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67,
        0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82,
        0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97,
        0.98, 0.99, 1.  , 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1 , 1.11, 1.12,
        1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.2 , 1.22, 1.26, 1.28, 1.31, 1.33, 1.34, 1.36, 1.56,
        1.59, 1.61, 1.62, 1.95, 1.98, 2.  ]),
 array([ 1,  2,  6,  4,  5,  8, 16, 12, 18, 19, 29, 31, 27, 26, 47, 51, 68, 50, 60, 55, 68, 51, 69,
        45, 61, 48, 46, 41, 42, 36, 35, 23, 33, 26, 28, 26, 26, 20, 25, 26, 23, 18, 19, 15, 22, 15,
        13, 14, 13, 13,  7,  7,  8,  8,  5, 10,  4,  2,  3,  6,  2,  3,  1,  1,  3,  2,  2,  3,  4,
         2,  3,  1,  2,  1,  1,  2,  2,  1,  1,  5,  3,  1,  1,  1

In [64]:
# Sulfites are a food preservative common in winemaking, since it maintain wine's flavor and freshness.
# They’re assoc. w/ long list of side effects like the dreaded wine-induced headache. 
# Most bottled dry red wines have around 0.5-0.75 g/L. 
# Since our dataset only contains dry red wine, based on range above, we defined following classification.
# normal range-- denoted by 1.
# out of the range -- denoted by 0.

normal_range = np.arange(0.50, 0.76, 0.01)

In [65]:
raw_redwine[:,9] = np.where(np.isin(raw_redwine[:,9], normal_range), 1, 0)

In [66]:
np.unique(raw_redwine[:,9], return_counts = True)

(array([0., 1.]), array([1270,  329]))

In [67]:
header_red[10]

'"alcohol"'

In [68]:
np.unique(raw_redwine[:,10], return_counts = True)

(array([ 8.4       ,  8.5       ,  8.7       ,  8.8       ,  9.        ,  9.05      ,  9.1       ,
         9.2       ,  9.23333333,  9.25      ,  9.3       ,  9.4       ,  9.5       ,  9.55      ,
         9.56666667,  9.6       ,  9.7       ,  9.8       ,  9.9       ,  9.95      , 10.        ,
        10.03333333, 10.1       , 10.2       , 10.3       , 10.4       , 10.5       , 10.55      ,
        10.6       , 10.7       , 10.75      , 10.8       , 10.9       , 11.        , 11.06666667,
        11.1       , 11.2       , 11.3       , 11.4       , 11.5       , 11.6       , 11.7       ,
        11.8       , 11.9       , 11.95      , 12.        , 12.1       , 12.2       , 12.3       ,
        12.4       , 12.5       , 12.6       , 12.7       , 12.8       , 12.9       , 13.        ,
        13.1       , 13.2       , 13.3       , 13.4       , 13.5       , 13.56666667, 13.6       ,
        14.        , 14.9       ]),
 array([  2,   1,   2,   2,  30,   1,  23,  72,   1,   1,  59, 103, 139, 

In [69]:
# since data (above) has values with many decimal points, 
# we use rounding for the readibility of command and analysis. 

raw_redwine[:,10] = np.round(raw_redwine[:,10], decimals = 2)

In [70]:
np.unique(raw_redwine[:,10], return_counts = True)

(array([ 8.4 ,  8.5 ,  8.7 ,  8.8 ,  9.  ,  9.05,  9.1 ,  9.2 ,  9.23,  9.25,  9.3 ,  9.4 ,  9.5 ,
         9.55,  9.57,  9.6 ,  9.7 ,  9.8 ,  9.9 ,  9.95, 10.  , 10.03, 10.1 , 10.2 , 10.3 , 10.4 ,
        10.5 , 10.55, 10.6 , 10.7 , 10.75, 10.8 , 10.9 , 11.  , 11.07, 11.1 , 11.2 , 11.3 , 11.4 ,
        11.5 , 11.6 , 11.7 , 11.8 , 11.9 , 11.95, 12.  , 12.1 , 12.2 , 12.3 , 12.4 , 12.5 , 12.6 ,
        12.7 , 12.8 , 12.9 , 13.  , 13.1 , 13.2 , 13.3 , 13.4 , 13.5 , 13.57, 13.6 , 14.  , 14.9 ]),
 array([  2,   1,   2,   2,  30,   1,  23,  72,   1,   1,  59, 103, 139,   2,   1,  59,  54,  78,
         49,   1,  67,   2,  47,  46,  33,  41,  67,   2,  28,  27,   1,  42,  49,  59,   1,  27,
         36,  32,  32,  30,  15,  23,  29,  20,   1,  21,  13,  12,  12,  13,  21,   6,   9,  17,
          9,   6,   2,   1,   3,   3,   1,   1,   4,   7,   1]))

In [71]:
# Wine can have anywhere between 5% and 23% Alcohol by Volume (ABV). 
# The avg alcohol content of wine is about 12%. 
# less than 11%-- light group which is denoted by 0.
# around 12% -- medium group denoted by 1.
# higher than 12% -- heavier group denoted by 2.

light_group = np.array([8.4 ,  8.5 ,  8.7 ,  8.8 ,  9.  ,  9.05,  9.1 ,  
                        9.2 ,  9.23,  9.25,  9.3 ,  9.4 ,  9.5 , 9.55,  9.57,  
                        9.6 ,  9.7 ,  9.8 ,  9.9 ,  9.95, 10.  , 10.03, 
                        10.1 , 10.2 , 10.3 , 10.4 , 10.5 , 10.55, 10.6 , 10.7 , 10.75, 
                        10.8 , 10.9 , 11. , 11.07, 11.1 , 11.2])
medium_group = np.array([11.3 , 11.4 , 11.5 , 11.6 , 11.7 , 11.8 , 11.9 , 11.95, 12.  , 12.1 , 12.2])
heavy_group = np.array([12.3 , 12.4 , 12.5 , 12.6 , 12.7 , 12.8 , 12.9 , 
                        13.  , 13.1 , 13.2 , 13.3 , 13.4 , 13.5 , 13.57, 13.6 , 14.  , 14.9 ])

In [72]:
raw_redwine[:,10] = np.where(np.isin(raw_redwine[:,10], light_group), 0, raw_redwine[:,10])

In [73]:
raw_redwine[:,10] = np.where(np.isin(raw_redwine[:,10], medium_group), 1, raw_redwine[:,10])

In [74]:
raw_redwine[:,10] = np.where(np.isin(raw_redwine[:,10], heavy_group), 2, raw_redwine[:,10])

In [75]:
np.unique(raw_redwine[:,10], return_counts = True)

(array([0., 1., 2.]), array([1255,  228,  116]))

In [76]:
header_red[11]

'"quality"'

In [77]:
np.unique(raw_redwine[:,11], return_counts = True)

(array([3., 4., 5., 6., 7., 8.]), array([ 10,  53, 681, 638, 199,  18]))

In [78]:
# The quality of the wines is a score between 1 and 10.
# low quality(0,1,2,3)-- denoted by 0.
# avg. quality(4,5,6,7)-- denoted by 1.
# high quality(8,9,10)-- denoted by 2.

low_quality = np.array([0,1,2,3])
avg_quality = np.array([4,5,6,7])
high_quality = np.array([8,9,10])

In [79]:
raw_redwine[:,11] = np.where(np.isin(raw_redwine[:,11], low_quality), 0, raw_redwine[:,11])

In [80]:
raw_redwine[:,11] = np.where(np.isin(raw_redwine[:,11], avg_quality), 1, raw_redwine[:,11])

In [81]:
raw_redwine[:,11] = np.where(np.isin(raw_redwine[:,11], high_quality), 2, raw_redwine[:,11])

In [82]:
np.unique(raw_redwine[:,11], return_counts = True)

(array([0., 1., 2.]), array([  10, 1571,   18]))

In [83]:
red_wine_df = np.vstack((header_red, raw_redwine))
red_wine_df

array([['"fixed acidity"', '"volatile acidity"', '"citric acid"', ..., '"sulphates"', '"alcohol"',
        '"quality"'],
       ['2.0', '2.0', '0.0', ..., '1.0', '0.0', '1.0'],
       ['2.0', '2.0', '0.0', ..., '0.0', '0.0', '1.0'],
       ...,
       ['1.0', '0.0', '1.0', ..., '0.0', '0.0', '1.0'],
       ['1.0', '1.0', '1.0', ..., '0.0', '0.0', '1.0'],
       ['1.0', '0.0', '1.0', ..., '0.0', '0.0', '1.0']], dtype='<U32')

In [84]:
np.savetxt("Red-Wine-PreProcessed.csv", red_wine_df, fmt = '%s', delimiter = ',')

In [85]:
grade_A = np.array(['2.0'])
grade_B = np.array(['1.0'])
grade_C = np.array(['0.0'])

In [86]:
red_wine_df[:,-1] = np.where(np.isin(red_wine_df[:,-1], grade_A), 'A', red_wine_df[:,-1])

In [87]:
red_wine_df[:,-1] = np.where(np.isin(red_wine_df[:,-1], grade_B), 'B', red_wine_df[:,-1])

In [88]:
red_wine_df[:,-1] = np.where(np.isin(red_wine_df[:,-1], grade_C), 'C', red_wine_df[:,-1])

In [89]:
np.unique(red_wine_df[:,-1])

array(['"quality"', 'A', 'B', 'C'], dtype='<U32')

In [90]:
np.savetxt("Red-Wine-Processed-with-Grades.csv", red_wine_df, fmt = '%s', delimiter = ',')

In [91]:
red_wine_df_2 = pd.read_csv("Red-Wine-Processed-with-Grades.csv", delimiter = ',')
red_wine_df_2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,B
1,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,B
2,2.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
3,3.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
4,2.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
1595,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
1596,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
1597,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B


In [92]:
# since each sample in 3rd column fell under same category; 3rd column ('residual sugar') is redundant. 
# since each sample in 7th column fell under same category; 7th column ('density') is also redundant.

red_wine_df_2 = red_wine_df_2.drop(axis = 1 , columns = ['residual sugar', 'density'])
red_wine_df_2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality
0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,B
1,2.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
2,2.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
3,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
4,2.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...
1594,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
1595,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
1596,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
1597,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B


In [93]:
red_wine_df_2 = red_wine_df_2.rename(columns = {"fixed acidity": "Fixed Acidity", 
                                                "volatile acidity": "Volatile Acidity",
                                                "citric acid": "Citric Acid", 
                                                "chlorides": "Chlorides", 
                                                "free sulfur dioxide": "Free SO2",
                                                "total sulfur dioxide": "Total SO2", 
                                                "pH": "pH", 
                                                "sulphates": "Sulphates",
                                                "alcohol": "Alcohol", 
                                                "quality": "Quality Grade"})
red_wine_df_2

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Chlorides,Free SO2,Total SO2,pH,Sulphates,Alcohol,Quality Grade
0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,B
1,2.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,B
2,2.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
3,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
4,2.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...
1594,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
1595,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
1596,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B
1597,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,B


In [94]:
red_wine_df_2 = red_wine_df_2.sort_values('Quality Grade')
red_wine_df_2

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Chlorides,Free SO2,Total SO2,pH,Sulphates,Alcohol,Quality Grade
828,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,A
481,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,A
455,3.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,A
440,4.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,A
278,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,A
...,...,...,...,...,...,...,...,...,...,...
1478,2.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,C
1374,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,C
832,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,C
690,2.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,C


In [95]:
red_wine_df_2.to_csv("Red-Wine-Preprocessed-Version-2.csv", sep = ',', index = False)

In [96]:
# fractioning the main table based on gquality classification

redwine_quality_A = red_wine_df_2[red_wine_df_2['Quality Grade'] == 'A']
redwine_quality_B = red_wine_df_2[red_wine_df_2['Quality Grade'] == 'B']
redwine_quality_C = red_wine_df_2[red_wine_df_2['Quality Grade'] == 'C']

In [97]:
# verifying that newly created tables (see above) are structurally consistent with the main table

redwine_quality_A.shape[0] + redwine_quality_B.shape[0] + redwine_quality_C.shape[0]

1599

In [98]:
redwine_quality_A.to_csv("A-Grade-Red-Wine-Preprocessed.csv", sep = ',', index = False)
redwine_quality_B.to_csv("B-Grade-Red-Wine-Preprocessed.csv", sep = ',', index = False)
redwine_quality_C.to_csv("C-Grade-Red-Wine-Preprocessed.csv", sep = ',', index = False)

In [99]:
raw_redwine_2 = np.genfromtxt("winequality-red.csv", delimiter = ';', skip_header = 1)
raw_redwine_2

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

In [100]:
raw_redwine_2[:,-1] = np.where(np.isin(raw_redwine_2[:,-1], low_quality), 0, raw_redwine_2[:,-1])

In [101]:
raw_redwine_2[:,-1] = np.where(np.isin(raw_redwine_2[:,-1], avg_quality), 1, raw_redwine_2[:,-1])

In [102]:
raw_redwine_2[:,-1] = np.where(np.isin(raw_redwine_2[:,-1], high_quality), 2, raw_redwine_2[:,-1])

In [103]:
np.unique(raw_redwine_2[:,-1])

array([0., 1., 2.])

In [104]:
raw_redwine_df = np.vstack((header_red, raw_redwine_2))
raw_redwine_df

array([['"fixed acidity"', '"volatile acidity"', '"citric acid"', ..., '"sulphates"', '"alcohol"',
        '"quality"'],
       ['7.4', '0.7', '0.0', ..., '0.56', '9.4', '1.0'],
       ['7.8', '0.88', '0.0', ..., '0.68', '9.8', '1.0'],
       ...,
       ['6.3', '0.51', '0.13', ..., '0.75', '11.0', '1.0'],
       ['5.9', '0.645', '0.12', ..., '0.71', '10.2', '1.0'],
       ['6.0', '0.31', '0.47', ..., '0.66', '11.0', '1.0']], dtype='<U32')

In [105]:
raw_redwine_df[:,-1] = np.where(np.isin(raw_redwine_df[:,-1], grade_A), 'A', raw_redwine_df[:,-1])

In [106]:
raw_redwine_df[:,-1] = np.where(np.isin(raw_redwine_df[:,-1], grade_B), 'B', raw_redwine_df[:,-1])

In [107]:
raw_redwine_df[:,-1] = np.where(np.isin(raw_redwine_df[:,-1], grade_C), 'C', raw_redwine_df[:,-1])

In [108]:
np.unique(raw_redwine_df[:,-1])

array(['"quality"', 'A', 'B', 'C'], dtype='<U32')

In [109]:
np.savetxt("Raw-Red-Wine-w-Grades.csv", raw_redwine_df, fmt = '%s', delimiter = ',')