In [None]:
# Loading important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('fivethirtyeight')

In [None]:
# Read the red-wine quality CSV into a DataFrame and preview its first ten rows
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(csv_path)
df.head(n=10)

# EDA and Visualizations

In [None]:
# Summary statistics, transposed so that every column of df becomes a row
df.describe().transpose()

In [None]:
# Number of rows per quality score in the raw data
count_fig, count_ax = plt.subplots()
sns.countplot(data=df, x='quality', ax=count_ax)
plt.show()

The target variable **'quality'** is unevenly distributed across its classes.

In [None]:
# Box plot of every feature against the target, one panel per feature.
cols = list(df.columns)  # also read by the histogram cells further down
# Select the features by name instead of relying on 'quality' being the last column.
feature_cols = [name for name in cols if name != 'quality']
fig, ax = plt.subplots(3, 4, figsize=(24, 18))
panels = ax.ravel()  # row-major view of the 3x4 grid

for panel, feature in zip(panels, feature_cols):
    sns.boxplot(y=feature, x='quality', data=df, ax=panel)

# Hide grid cells that received no plot so they do not render as empty frames.
for panel in panels[len(feature_cols):]:
    panel.set_visible(False)

plt.show()

Almost all the features have many outliers in their distribution.

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
corr = df.corr()
heat_fig, heat_ax = plt.subplots(figsize=(12, 12))
sns.heatmap(data=corr, annot=True, cmap='YlOrRd', ax=heat_ax)
plt.show()

Several features are noticeably correlated with quality. We will revisit these correlations after feature engineering.

In [None]:
# Histogram with a KDE overlay for each of the first 12 columns, laid out on a 3x4 grid
fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(24, 18))

for position in range(12):
    grid_row, grid_col = divmod(position, 4)
    sns.histplot(data=df, x=cols[position], kde=True, ax=ax[grid_row][grid_col])

Residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, and sulphates are heavily skewed, so we apply a log transform to them.

# Feature Engineering

## Log scaling skewed features

In [None]:
# Columns whose distributions are replaced by their natural logarithm
skew_cols = ['residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'sulphates']

# Apply the NumPy ufunc to the whole column block at once instead of calling a
# Python lambda once per element.
# NOTE: this overwrites df in place, so running the cell a second time applies
# the logarithm twice; reload the data before re-running it.
df[skew_cols] = np.log(df[skew_cols])

In [None]:
# Redraw the 3x4 histogram grid (green this time) on the current contents of df
fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(24, 18))
flat_axes = ax.ravel()  # row-major: element i is ax[i // 4][i % 4]

for position in range(12):
    sns.histplot(data=df, x=cols[position], color='green', kde=True, ax=flat_axes[position])

## Resampling due to uneven class distribution

In [None]:
# One sub-frame per quality score; these are the inputs for the resampling cell
quality_values = df['quality']
df_3 = df.loc[quality_values == 3]
df_4 = df.loc[quality_values == 4]
df_5 = df.loc[quality_values == 5]
df_6 = df.loc[quality_values == 6]
df_7 = df.loc[quality_values == 7]
df_8 = df.loc[quality_values == 8]

In [None]:
from sklearn.utils import resample

# Bring every quality class to exactly 400 rows.
# Classes 3, 4, 7 and 8 are drawn with replacement, so individual rows repeat.
df_3_upsampled = resample(df_3, replace=True, n_samples=400, random_state=42)
df_4_upsampled = resample(df_4, replace=True, n_samples=400, random_state=42)
df_7_upsampled = resample(df_7, replace=True, n_samples=400, random_state=42)
df_8_upsampled = resample(df_8, replace=True, n_samples=400, random_state=42)

# Classes 5 and 6 are cut down to 400 rows. The seed is fixed here as well:
# without it a different subset is drawn on every run, so the scores reported
# further down would not be reproducible.
df_5_downsampled = df_5.sample(n=400, random_state=42).reset_index(drop=True)
df_6_downsampled = df_6.sample(n=400, random_state=42).reset_index(drop=True)

In [None]:
# Stack the six 400-row class samples into one frame with a fresh 0..n-1 index
balanced_parts = [
    df_3_upsampled,
    df_4_upsampled,
    df_7_upsampled,
    df_8_upsampled,
    df_5_downsampled,
    df_6_downsampled,
]
df_resampled = pd.concat(balanced_parts, ignore_index=True)
# Row count per quality score, ordered by score
df_resampled['quality'].value_counts().sort_index()

In [None]:
# Number of rows per quality score after resampling
balanced_fig, balanced_ax = plt.subplots()
sns.countplot(data=df_resampled, x='quality', ax=balanced_ax)
plt.show()

Equal sample size for all classes.

In [None]:
# Correlation matrix of the resampled frame, drawn as an annotated heatmap
corr_2 = df_resampled.corr()
heat_fig_2, heat_ax_2 = plt.subplots(figsize=(12, 12))
sns.heatmap(data=corr_2, annot=True, cmap='YlOrRd', ax=heat_ax_2)
plt.show()

In [None]:
# Keep the columns whose correlation with quality is at least 0.05 in magnitude
quality_corr = corr_2['quality']
quality_corr[quality_corr.abs() >= 0.05]

Ten features are significantly correlated with quality (|r| ≥ 0.05), so the remaining feature is dropped before modeling.

In [None]:
# Feature matrix: everything except the target and the dropped 'residual sugar' column
dropped_columns = ['residual sugar', 'quality']
X = df_resampled.drop(columns=dropped_columns)
# Target vector
y = df_resampled.loc[:, 'quality']

# Modeling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline decision tree scored with 5-fold cross-validation.
# NOTE(review): X and y come from df_resampled, where rows were drawn with
# replacement before any train/validation split. Copies of the same row can land
# in both the training and the validation fold, so this score is likely optimistic.
model = DecisionTreeClassifier(random_state=42)
score = cross_val_score(estimator=model, X=X, y=y, cv=5)
dt_mean = score.mean()
print('Initial Score for DT classifier: ', score, '\nMean score: ', dt_mean)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive 5-fold grid search over tree depth and the number of features
# considered at each split.
model = DecisionTreeClassifier(random_state=42)
# 'auto' is omitted on purpose: for DecisionTreeClassifier it was only an alias
# of 'sqrt', and current scikit-learn releases reject it, which makes every
# candidate using it fail to fit.
parameters = {'max_depth': [5, 10, 15, 20], 'max_features': ['sqrt', 'log2']}
cv = GridSearchCV(model, parameters, cv=5)
cv.fit(X, y)

In [None]:
# Mean cross-validated score of the best parameter combination found by the grid search
cv.best_score_

The grid search reaches 83.7% cross-validated accuracy.

In [None]:
# Estimator selected by the grid search, displayed with its chosen hyper-parameters
cv.best_estimator_

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest with default hyper-parameters, scored by 5-fold accuracy
model = RandomForestClassifier(random_state=42)
score = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)
rf_mean = score.mean()
print('Initial score for RF: ', score, '\nMean Score: ', rf_mean)

I used a grid search to find the optimal `max_depth=15`. It takes a long time to run, so only the result is shown here.

In [None]:
# Random forest capped at depth 15, scored by 5-fold accuracy
rf_settings = {'random_state': 42, 'max_depth': 15}
model = RandomForestClassifier(**rf_settings)
score = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)
rf_tuned_mean = score.mean()
print('Improved score for RF: ', score, '\nMean Score: ', rf_tuned_mean)

Final result: **87.13%** accuracy. There is still plenty of scope for tuning the models to improve performance.

# References

Feature Engineering and Resampling:
- **Red Wine 🍷 Quality Assesment 📊 | Starter Pack**  : https://www.kaggle.com/aditimulye/red-wine-quality-assesment-starter-pack