In [82]:
import pandas as pd
import numpy as np

# Visualize
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Preprocessing
from sklearn import model_selection as ms
from imblearn.over_sampling import ADASYN
from sklearn import preprocessing
from sklearn.decomposition import PCA

# Metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, classification_report

# Models
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Data import: the CSV path is resolved relative to the current working directory
DATASET_PATH = "winequality-red.csv"
df = pd.read_csv(DATASET_PATH)

In [None]:
# Quick look at the summary statistics of every numeric column
df.describe()

#
**Contextualization**

- Measurements of 11 physical-chemical variables that characterize each sample (the features of the problem):
<br>
  - 1 - fixed acidity - measurement of acidity due to the presence of low-volatility organic acids (malic, lactic, tartaric or citric acid);
  - 2 - volatile acidity - measurement of acidity due to the presence of low molecular weight acids (mainly acetic acid), which are responsible for the vinegar aroma and taste;
  - 3 - citric acid - measurement of citric acid;
  - 4 - residual sugar - measurement of residual sugar present, originating from the sugar residues of the grape that remain after the end of fermentation;
  - 5 - chlorides - measurement of chlorides (chlorine ions);
  - 6 - free sulfur dioxide - measurement of free sulfur dioxide (i.e., that which is not bound to other molecules);
  - 7 - total sulfur dioxide - measurement of total sulfur dioxide (free + portion bound to other molecules);
  - 8 - density - measure of the density of the wine;
  - 9 - pH - measure of the pH of the wine;
  - 10 - sulphates - measure of sulfates (SO₄²⁻ ions);
  - 11 - alcohol - measure of the alcoholic strength;
  - 12 - quality - numerical quality score (from 3 to 8), produced based on sensory data. This is the response variable, not one of the 11 features.

#
1. The main descriptive statistics of position (mean, median, quartiles etc.) and dispersion (std, IQR etc.)

In [7]:
# Keep the summary table in a variable so that further rows can be added to it
df_desc = df.describe()

In [8]:
# Interquartile range (Q3 - Q1) appended as an extra row of the summary table
df_desc.loc["IQR"] = df_desc.loc["75%"] - df_desc.loc["25%"]

# Display the completed table: it is the result of this section and was
# previously computed without ever being shown
df_desc

In [None]:
# Rank the features by the importance a single decision tree assigns to them
# when predicting the original (multi-class) quality score
X = df.drop(columns="quality")
Y = df["quality"]

clf = DecisionTreeClassifier(random_state=42).fit(X, Y)
importances = clf.feature_importances_

feature_importance_df = (
  pd.DataFrame({'Feature': X.columns, 'Importance': importances})
  .sort_values(by='Importance', ascending=False)
)

feature_importance_df

#
2. Presence of outliers

In [None]:
# One small histogram (with a KDE overlay) per column of the dataset
for col in df.columns:
  fig, ax = plt.subplots(figsize=(4, 4))
  sns.histplot(data=df, x=col, kde=True, ax=ax)
  ax.set_title(f"Variable distribution: {col}")
  plt.show()

In [None]:
## Quartile (IQR) method: a value is an outlier when it lies more than
## 1.5 * IQR below the first quartile or above the third quartile.

for col in df.columns.drop('quality'):

  q1, q3 = df[col].quantile([0.25, 0.75])
  iqr = q3 - q1
  lower_fence = q1 - (iqr * 1.5)
  upper_fence = q3 + (iqr * 1.5)

  # Boolean mask of the rows outside the fences, then their index labels
  is_outlier = (df[col] < lower_fence) | (df[col] > upper_fence)
  indices_outliers = df.index[is_outlier].tolist()

  if indices_outliers:
    print(f"The column {col} has {len(indices_outliers)} outliers!")
    print("\nIts indexes are:\n")
    print(indices_outliers)
  else:
    print(f"The column {col} has not outliers!")

  # Visual separator between columns
  print()
  print("="*80)
  print()

#
3. Bar graph of the 90% confidence interval for the mean of each of the physical-chemical variables, grouped by the categorical levels of the response variable quality.

In [None]:
# Overall mean of one variable, shown as a reference point for the grouped bar plots
df["fixed acidity"].mean()

In [None]:
# Mean of every physical-chemical variable per quality level;
# the error bars are 90% confidence intervals for the mean
for col in df.columns.drop("quality"):
  fig, ax = plt.subplots()
  sns.barplot(
    data=df, x="quality", y=col,
    errorbar=("ci", 90), hue="quality", ax=ax,
  )
  plt.show()

In [27]:
# Binary label: "bom" (good) when quality > 5, otherwise "ruim" (bad).
# NOTE: this adds a string column to `df` itself, so every later cell that
# uses `df` sees `quality_bin` as well.
df['quality_bin'] = df['quality'].gt(5).map({True: "bom", False: "ruim"})

In [None]:
# Frame for the binary analysis: every column except the numeric quality score
df_bin = df.drop('quality', axis=1)
df_bin

In [None]:
# Distribution of every feature, split by the binary quality label
feature_cols = df_bin.columns.drop('quality_bin')
for col in feature_cols:
  ax = sns.histplot(data=df_bin, x=col, kde=True, hue=df_bin['quality_bin'])
  ax.set_title(f"Distribuição da variável {col}")
  plt.show()

#
4. Machine Learning

In [None]:
# Correlation plot
# numeric_only=True is required: at this point `df` also holds the string
# column `quality_bin`, and a plain `df.corr()` raises on it (pandas >= 2.0).
plt.figure(figsize=(12, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.title(
  "Correlation".upper(),
  fontdict={'family':'calibri', 'fontsize':18, 'weight':'bold', 'color':'red'},
  pad=20
)
plt.show()

In [None]:
print("Correlation with quality column:".center(50).upper())
print()

# Pearson correlation of every numeric feature with `quality`.
# numeric_only=True skips the string column `quality_bin`, which cannot be
# correlated and would otherwise make the loop fail.
quality_corr = df.corr(numeric_only=True)["quality"].drop("quality")

for col, corr_value in quality_corr.items():
  # No nested double quotes inside the f-string: reusing the outer quote
  # character is a SyntaxError on Python < 3.12.
  print(f"{col}: {corr_value:.4f}")

In [None]:
# Division between independent and dependent variables.
# `quality_bin` is derived from the target (and is a string column), so it
# must never be part of the features; errors="ignore" keeps the cell working
# whether or not that column has been created.
X = df.drop(columns=["quality", "quality_bin"], errors="ignore")
y = df["quality"].values

# Cube root to reduce the influence of outliers (similar result to a logarithm).
# np.cbrt is vectorised and is stateless, so it is safe to apply before the split.
X = np.cbrt(X)

# Split FIRST: every fitted step below (scaler, oversampler, PCA) must only
# see the training data, otherwise information about the test rows leaks into
# the model and the reported scores are optimistic.
# stratify=y keeps the rare quality levels present in both splits.
X_train, X_test, y_train, y_test = ms.train_test_split(
  X, y, test_size=0.2, random_state=42, stratify=y
)

# Robust to outliers; statistics are learned from the training split only
scaler = preprocessing.RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversampling using ADASYN (gave a better result than SMOTE).
# Applied to the training split only: the test split must keep the real
# class distribution and must not contain or influence synthetic samples.
adasyn = ADASYN(
  sampling_strategy='minority',
  random_state=42,
  n_neighbors=5
)
X_train, y_train = adasyn.fit_resample(X_train, y_train)

# Reduce to 10 components (gave a better result); fitted on the training split
pca = PCA(n_components=10, random_state=42)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Full preprocessed matrix, kept under the name `X` for the distribution
# plots further down the notebook
X = np.vstack([X_train, X_test])

# Build Random Forest Model
rf = RandomForestClassifier(random_state=42)

# Hyperparameters to iterate
param_grid = [
  {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20]
  }
]

# Execute GridSearch and Train Model
grid_search = ms.GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [97]:
# Predictions of the grid search's best model on the held-out split
y_pred = grid_search.predict(X_test)

# Confusion matrix and overall accuracy on the same predictions
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Model evaluation: overall accuracy followed by the per-class report
accuracy_pct = f"{round(accuracy * 100,2)}%"
print(
  "Model Accuracy:",
  accuracy_pct,
  "",
  "Random Forest Classification Report:",
  sep="\n",
)
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Plot Confusion Matrix to evaluate model
font_ticklabels = FontProperties(
  family='calibri', size=12, weight='bold')

# Tick labels must follow the row/column order of `conf_matrix`:
# confusion_matrix() uses the sorted union of the labels found in y_test and
# y_pred. A set built from y_train has no guaranteed order and may contain a
# different number of classes, which would mislabel the axes.
class_labels = np.union1d(y_test, y_pred).tolist()

plt.figure(figsize=(10, 7))
ax_rf = sns.heatmap(
  conf_matrix,
  annot=True,
  fmt='d',
  cmap='Blues',
  xticklabels=class_labels,
  yticklabels=class_labels,
  cbar=False # Remove color bar
)

plt.xticks(fontproperties=font_ticklabels)
plt.yticks(fontproperties=font_ticklabels)

plt.xlabel('Predicted', fontdict={'fontsize':14}, labelpad=20)
plt.ylabel('Test', fontdict={'fontsize':14}, labelpad=20)
plt.title('Confusion Matrix', fontdict={'fontsize':16})
plt.show();

In [None]:
# Density curve of every column of the feature matrix `X`
X_frame = pd.DataFrame(X)
for col in X_frame.columns:
  ax = sns.kdeplot(data=X_frame, x=col)
  ax.set_title(f"Variable distribution: {col}")
  plt.show()

In [None]:
# Contamination check after the cube-root transformation

# Contamination values to be tested
contamination_values = [0.001, 0.01, 0.05, 0.1]

# For each quality class, fit an IsolationForest on the training samples of
# that class and check how many held-out samples of the same class it accepts.
for classe in sorted(df["quality"].unique()):
  print(f"Class: {classe}")

  X_train_cls = X_train[y_train == classe]
  X_test_cls = X_test[y_test == classe]

  # IsolationForest cannot be fitted or scored on an empty array
  if len(X_train_cls) == 0 or len(X_test_cls) == 0:
    print("Skipped: no samples of this class in the train or test split")
    continue

  # IsolationForest.predict returns +1 (inlier) / -1 (outlier). Every held-out
  # sample here truly belongs to the class, so the expected label is +1.
  # Comparing the predictions with the quality values themselves (3..8) can
  # never match and always yields 0.
  expected = np.ones(len(X_test_cls), dtype=int)

  for contamination in contamination_values:
    model = IsolationForest(contamination=contamination, random_state=42)
    model.fit(X_train_cls)

    # Separate name: `y_pred` belongs to the Random Forest evaluation cells
    inlier_pred = model.predict(X_test_cls)

    # Recall is the informative number (share of genuine samples accepted);
    # precision is 1.0 whenever at least one sample is accepted.
    precision = precision_score(expected, inlier_pred, pos_label=1, zero_division=0)
    recall = recall_score(expected, inlier_pred, pos_label=1, zero_division=0)
    print(f'Contamination: {contamination}, Precision: {precision}, Recall: {recall}')

#
5. Quality reduced to 'Bad' or 'Good'

In [46]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

##
Analysis

In [4]:
# Binary version of the dataset: quality becomes 0 (score <= 5) or 1 (score > 5).
# `quality_bin` (added to `df` in an earlier section) is a string copy of this
# very target, so it is dropped here: keeping it would leak the answer into the
# features and break every numeric step below. errors="ignore" keeps the cell
# valid when that column does not exist.
df2 = (
  df
  .drop(columns=["quality_bin"], errors="ignore")
  .assign(quality=lambda frame: np.where(frame["quality"] <= 5, 0, 1))
)

In [None]:
# Share of each binary quality class, in percent
df2["quality"].value_counts(normalize=True).mul(100)

In [None]:
# Feature importance of a single decision tree on the binary target.
# `quality_bin` is excluded: it is a string re-encoding of the target, so it
# cannot be fitted and would otherwise leak the target into the features.
X = df2.drop(columns=["quality", "quality_bin"], errors="ignore")
Y = df2["quality"]

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X, Y)

importances = clf.feature_importances_

# Most important features first
feature_importance_df = pd.DataFrame({
  'Feature': X.columns,
  'Importance': importances
}).sort_values(by='Importance', ascending=False)

feature_importance_df

In [None]:
# Correlation plot for the binary-target frame
# numeric_only=True: `df2` is copied from `df`, which may carry the string
# column `quality_bin`; a plain `.corr()` raises on it (pandas >= 2.0).
plt.figure(figsize=(12, 6))
sns.heatmap(df2.corr(numeric_only=True), annot=True)
plt.title(
  "Correlation".upper(),
  fontdict={'family':'calibri', 'fontsize':18, 'weight':'bold', 'color':'red'},
  pad=20
)
plt.show()

##
Model

In [None]:
# Division between independent and dependent variables.
# `quality_bin` is a string copy of the target and must not be a feature.
X = df2.drop(columns=["quality", "quality_bin"], errors="ignore")
y = df2["quality"].values

# Cube root to reduce the influence of outliers (stateless, safe before the split)
X = np.cbrt(X)

# Split before fitting the scaler and PCA so that they only learn from the
# training data; fitting them on the full matrix leaks test information.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42
)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reduce to 10 components (gave a better result)
pca = PCA(n_components=10, random_state=42)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

rf = RandomForestClassifier(random_state=42)

param_grid = {
  'n_estimators': [50, 100, 200],
  'max_depth': [None, 10, 20],
  'min_samples_split': [2, 5, 10],
}

# n_jobs=-1 runs the 27 x 5 fits in parallel, like the first grid search
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Train the grid-searched Random Forest
grid_search.fit(X_train, y_train)

In [80]:
# Predictions of the grid search's best model on the held-out split
y_pred = grid_search.predict(X_test)

# Confusion matrix and overall accuracy on the same predictions
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Model evaluation: overall accuracy followed by the per-class report
accuracy_pct = f"{round(accuracy * 100,2)}%"
print(
  "Model Accuracy:",
  accuracy_pct,
  "",
  "Random Forest Classification Report:",
  sep="\n",
)
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Plot Confusion Matrix to evaluate model
font_ticklabels = FontProperties(
  family='calibri', size=12, weight='bold')

# Tick labels must follow the row/column order of `conf_matrix`:
# confusion_matrix() uses the sorted union of the labels found in y_test and
# y_pred. A set built from y_train has no guaranteed order and may contain a
# different number of classes, which would mislabel the axes.
class_labels = np.union1d(y_test, y_pred).tolist()

plt.figure(figsize=(10, 7))
ax_rf = sns.heatmap(
  conf_matrix,
  annot=True,
  fmt='d',
  cmap='Blues',
  xticklabels=class_labels,
  yticklabels=class_labels,
  cbar=False # Remove color bar
)

plt.xticks(fontproperties=font_ticklabels)
plt.yticks(fontproperties=font_ticklabels)

plt.xlabel('Predicted', fontdict={'fontsize':14}, labelpad=20)
plt.ylabel('Test', fontdict={'fontsize':14}, labelpad=20)
plt.title('Confusion Matrix', fontdict={'fontsize':16})
plt.show();