# Wine Quality Classification

The notebook is intended to develop & validate a model for multi-class classification of the Wine Quality.

In [None]:
# Import Standard Libraries
import pandas as pd
import os
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import zscore

In [None]:
# Background colour shared by the figure canvas and the plotting area
background_color = '#E5E8E8'

# Matplotlib rc overrides applied on top of the Seaborn style
theme_parameters = dict([
    ('axes.spines.right', False),
    ('axes.spines.top', False),
    ('grid.alpha', 0.3),
    ('figure.figsize', (16, 6)),
    ('font.family', 'Andale Mono'),
    ('axes.titlesize', 24),
    ('figure.facecolor', background_color),
    ('axes.facecolor', background_color),
])

# Activate the theme for every figure created from here on
default_palette = sns.color_palette('deep')
sns.set_theme(rc=theme_parameters,
              palette=default_palette,
              style='whitegrid')

In [None]:
# Notebook's variables
# Folder holding the competition files, relative to the notebook location.
# The path is assembled from its components so that `os.path.join` actually
# does the joining (it is a no-op when given a single, pre-built string).
data_directory = os.path.join('.', '..', '..', 'data', 'S3E5')

# CSV files with the train and test sets
train_data_path = os.path.join(data_directory, 'wine_quality_train.csv')
test_data_path = os.path.join(data_directory, 'wine_quality_test.csv')

# Read Data

In [None]:
# Load the train and test sets from their CSV files (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [None]:
# Preview the first rows of the train set
train_data.head()

In [None]:
# Preview the first rows of the test set
test_data.head()

In [None]:
# Print the column names, non-null counts and dtypes of the train set
train_data.info()

In [None]:
# Print the column names, non-null counts and dtypes of the test set
test_data.info()

# Exploratory Data Analysis

## Train Feature & Label Distribution

In [None]:
# Plot the histogram of each feature and of the label on a 3x4 grid
figure, ax = plt.subplots(3, 4, figsize=(16, 9))
ax = ax.flatten()

# Columns to plot: every column except the first one (the 'id' column)
columns_to_plot = train_data.columns[1:]

for index, column_name in enumerate(columns_to_plot):

    # Plot data
    sns.histplot(data=train_data[column_name],
                 ax=ax[index])

    ax[index].set_title(column_name,
                        fontsize=14,
                        fontweight='bold')

    # Rotate the tick labels so long numbers do not overlap
    ax[index].tick_params(labelrotation=45)

# Remove every subplot that did not receive a column, if any
for unused_axis in ax[len(columns_to_plot):]:
    figure.delaxes(unused_axis)

# Set title plot
plt.suptitle('Feature & Label Distribution',
             fontsize=20)

plt.tight_layout()

# Render explicitly, like the other plotting cells of the notebook
plt.show()

The following features have a skewed distribution:
- `fixed acidity`
- `citric acid`
- `residual sugar`
- `free sulfur dioxide`
- `total sulfur dioxide`
- `sulphates`
- `alcohol`

It would be useful to use the Z-Score Outliers Filter.
<br>

In addition, the label classes 3, 4 and 8 have very few records, i.e. the data is imbalanced. Consider using a Stratified K-Fold during the training of the model.

## Train Feature Distribution per Label

In [None]:
# Plot the box plot of each feature per label on a 3x4 grid
figure, ax = plt.subplots(3, 4, figsize=(16, 12))
ax = ax.flatten()

# Columns to plot: skip the first column ('id') and the last one ('quality')
feature_columns = train_data.columns[1:-1]

for index, column_name in enumerate(feature_columns):

    # One box per 'quality' class for the current feature
    sns.boxplot(data=train_data,
                x='quality',
                y=column_name,
                ax=ax[index])

# Remove every subplot that did not receive a feature
# (not just the last one, so the grid stays clean if the feature count changes)
for unused_axis in ax[len(feature_columns):]:
    figure.delaxes(unused_axis)

# Set title plot
plt.suptitle('Feature Distribution per Label',
             fontsize=20,
             fontweight='bold')

plt.tight_layout()

plt.show()

There is a positive non-linear relationship between the following features and the `Quality`:
- `sulphates`
- `alcohol`

There is a negative non-linear relationship between the following features and the `Quality`:
- `volatile acidity`
- `density`

## Pearson Correlation

In [None]:
# Pairwise correlation of every column except the first one (the identifier)
columns_without_id = train_data.columns[1:]
correlation_matrix = train_data[columns_without_id].corr()

In [None]:
# Boolean mask that is True on and above the main diagonal,
# used to hide the redundant upper triangle of the correlation matrix
all_true = np.ones(correlation_matrix.shape, dtype=bool)
correlation_mask = np.triu(all_true)

In [None]:
# Create the canvas that hosts the heatmap
figure, ax = plt.subplots(figsize=(12, 8))

# Look & feel of the heatmap: fixed [-1, 1] colour scale centred on 0,
# square cells with thin separators and the coefficient printed in each cell
heatmap_options = dict(
    mask=correlation_mask,
    cmap='mako',
    vmin=-1.0,
    vmax=1.0,
    center=0,
    square=True,
    linewidths=.5,
    annot=True,
    annot_kws={'fontsize': 8},
    cbar_kws={"shrink": .8, 'orientation': 'vertical'},
)

# Draw the correlation matrix on the axis created above
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

# Title of the plot
ax.set_title('Pearson Correlation', fontsize=20, fontweight='bold')

plt.tight_layout()
plt.show()

The following features show a significant positive correlation:
- `citric acid` and `fixed acidity`
- `density` and `fixed acidity`
- `total sulfur dioxide` and `free sulfur dioxide`

The following features show a significant negative correlation:
- `citric acid` and `volatile acidity`
- `pH` and `fixed acidity`

## Train vs Test Feature & Label Distribution

In [None]:
# Plot the KDE of each feature for the train and the test set on a 3x4 grid
figure, ax = plt.subplots(3, 4, figsize=(16, 12))
ax = ax.flatten()

# Columns to plot: skip the first column ('id') and the last one ('quality')
feature_columns = train_data.columns[1:-1]

for index, column_name in enumerate(feature_columns):

    # Overlay the train and test densities on the same axis
    sns.kdeplot(data=train_data[column_name],
                label='Train',
                ax=ax[index])

    sns.kdeplot(data=test_data[column_name],
                label='Test',
                ax=ax[index])

    ax[index].set_title(column_name, fontsize=14)

    ax[index].tick_params(labelrotation=45)

    # Drop a per-axis legend only if one was actually drawn
    # (avoids creating a legend just to remove it)
    axis_legend = ax[index].get_legend()
    if axis_legend is not None:
        axis_legend.remove()

# Remove every subplot that did not receive a feature
for unused_axis in ax[len(feature_columns):]:
    figure.delaxes(unused_axis)

# Retrieve the legend entries once: every axis holds the same 'Train'/'Test' pair,
# and reading them outside the loop keeps them defined even with no feature columns
handles, labels = ax[0].get_legend_handles_labels()

# Set a single legend shared by the whole figure
figure.legend(handles,
              labels,
              loc='upper center',
              bbox_to_anchor=(0.5, 1.03),
              fontsize=12,
              ncol=2)

plt.tight_layout()

# Render explicitly, like the other plotting cells of the notebook
plt.show()

There are no strong differences between the feature distribution of the train set and the test set.

## Count Outliers with the Z-Score across Quality Classes

In [None]:
# Compute the Z-Score for the feature columns across 'quality' classes
# - `iloc[:, 1:-1]` keeps every column except the first and the last one
#   (presumably the identifier and the 'quality' label -- verify against the CSV header)
# - rows are grouped by their 'quality' value and `zscore` is applied to each group separately,
#   so each value is standardised against its own class rather than the whole train set
# NOTE(review): the following cell reads level 0 of the resulting index as the group key, which
# assumes `zscore` hands back a pandas object for each group -- confirm with the installed SciPy
z_scores = train_data.iloc[:, 1:-1].groupby(train_data['quality'], 
                                            group_keys=True).apply(zscore)

In [None]:
# Level 0 of the Z-Score index is used as the grouping key
group_keys = z_scores.index.get_level_values(0)

# Flag as 'outlier' every record whose absolute Z-Score is at least 2 SDs
is_outlier = z_scores.abs().ge(2)

# Count the flagged records per group and per feature
outliers = is_outlier.groupby(group_keys).sum()

In [None]:
# Plot the outliers count per feature across 'quality' classes on a 3x4 grid
figure, ax = plt.subplots(3, 4, figsize=(16, 9))
ax = ax.flatten()

# One bar chart per column of the outliers table
for index, column_name in enumerate(outliers.columns):

    axis = ax[index]

    # Bars: outliers count of the current feature for each index value of the table
    sns.barplot(x=outliers.index, y=column_name, data=outliers, ax=axis)

    axis.set_title(column_name, fontsize=14)
    axis.tick_params(labelrotation=45)

# Remove the empty subplot
last_axis = ax[-1]
figure.delaxes(last_axis)

# Set title plot
plt.suptitle('Outliers Count', fontsize=20, fontweight='bold')

plt.tight_layout()

The quality classes 5, 6 and 7 show the highest amount of outliers.

## Train Feature Pairplot

In [None]:
# Keep only the feature columns (no identifier, no label)
pairplot_features = train_data.drop(columns=['Id', 'quality'])

# Regression plots for each pair of features, KDE curves on the diagonal,
# red regression lines and only one triangle of the grid (corner layout)
pairplot_options = {
    'kind': "reg",
    'diag_kind': 'kde',
    'plot_kws': {'line_kws': {'color': 'red'}},
    'corner': True,
}

# Plot the Pairplot between the features
sns.pairplot(pairplot_features, **pairplot_options)

# Set title plot
plt.suptitle('Train Feature Pairplot', fontsize=20, fontweight='bold')

plt.tight_layout()
plt.show()

## Conclusions
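- Several numerical features present a right-skewed distribution -> Use a StandardScaler
- `quality` target is imbalanced (classes 3, 4 and 8 have very few records) -> Use a Stratified K-Fold during training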

# Data Preparation