# Wine Quality Classification

This notebook develops and validates a model for multi-class classification of wine quality.

In [None]:
# Import Standard Libraries
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Matplotlib rc overrides that are handed to the Seaborn theme
theme_parameters = {
    # Hide the right and top spines of every axes
    **dict.fromkeys(('axes.spines.right', 'axes.spines.top'), False),
    # Keep the grid lines faint
    'grid.alpha': 0.3,
    # Default figure size (width, height)
    'figure.figsize': (16, 6),
    'font.family': 'Andale Mono',
    'axes.titlesize': 24,
    # Use the same background colour for the figure and the axes
    **dict.fromkeys(('figure.facecolor', 'axes.facecolor'), '#E5E8E8'),
}

# Activate the 'whitegrid' style with the 'deep' colour palette,
# applying the rc overrides stored in theme_parameters
sns.set_theme(
    style='whitegrid',
    palette=sns.color_palette('deep'),
    rc=theme_parameters,
)

In [None]:
# Notebook's variables
# Folder that holds the CSV files, expressed relative to the working directory.
# The path is assembled from its components so that os.path.join actually does
# the joining (a single pre-joined string makes the call a no-op).
data_directory = os.path.join('.', '..', '..', 'data', 'S3E5')

# Locations of the train and test CSV files
train_data_path = os.path.join(data_directory, 'wine_quality_train.csv')
test_data_path = os.path.join(data_directory, 'wine_quality_test.csv')

# Read Data

In [None]:
# Load the train CSV file, then the test CSV file, into DataFrames
train_data = pd.read_csv(filepath_or_buffer=train_data_path)
test_data = pd.read_csv(filepath_or_buffer=test_data_path)

In [None]:
# Preview the first rows of the train data
train_data.head()

In [None]:
# Preview the first rows of the test data
test_data.head()

In [None]:
# Print a concise summary of the train DataFrame
train_data.info()

In [None]:
# Print a concise summary of the test DataFrame
test_data.info()

# Exploratory Data Analysis

## Feature Distribution

In [None]:
# Plot the histograms of each feature
# Columns to plot: everything except the first column (assumed to be the 'id' column)
histogram_columns = train_data.columns[1:]

# Size the grid from the number of columns instead of hard-coding 3 x 4, so that
# every column gets a panel (12 columns still give a 3 x 4 grid of 16 x 9)
grid_columns = 4
grid_rows = max(1, -(-len(histogram_columns) // grid_columns))  # ceiling division

# squeeze=False always returns a 2D array of axes, whatever the grid shape
figure, ax = plt.subplots(grid_rows, grid_columns,
                          figsize=(16, 3 * grid_rows),
                          squeeze=False)
ax = ax.flatten()

for index, column_name in enumerate(histogram_columns):

    # Plot data
    sns.histplot(data=train_data[column_name],
                 ax=ax[index])

    ax[index].set_title(column_name, fontsize=14)

    # Rotate the tick labels to keep them from overlapping
    ax[index].tick_params(labelrotation=45)

# Remove the panels left without data, if any
for unused_axes in ax[len(histogram_columns):]:
    figure.delaxes(unused_axes)

plt.tight_layout()
plt.show()

The following features have a skewed distribution:
- `fixed acidity`
- `citric acid`
- `residual sugar`
- `free sulfur dioxide`
- `total sulfur dioxide`
- `sulphates`
- `alcohol`

## Feature Distribution per Label

In [None]:
# Plot the box plot of each feature per label
# Columns to plot: everything except the first and the last column
# (assumed to be the 'id' column and the 'quality' column)
feature_columns = train_data.columns[1:-1]

# Size the grid from the number of features instead of hard-coding 3 x 4
# (11 features still give a 3 x 4 grid of 16 x 9)
grid_columns = 4
grid_rows = max(1, -(-len(feature_columns) // grid_columns))  # ceiling division

# squeeze=False always returns a 2D array of axes, whatever the grid shape
figure, ax = plt.subplots(grid_rows, grid_columns,
                          figsize=(16, 3 * grid_rows),
                          squeeze=False)
ax = ax.flatten()

for index, column_name in enumerate(feature_columns):

    # Plot data: one box per 'quality' value
    sns.boxplot(data=train_data,
                x='quality',
                y=column_name,
                ax=ax[index])

# Remove every panel left without data, not only the last one
for unused_axes in ax[len(feature_columns):]:
    figure.delaxes(unused_axes)

plt.suptitle('Feature Distribution per Label', fontsize=14)
plt.tight_layout()
plt.show()