In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the red-wine CSV, relative to the notebook's working directory.
WINE_CSV_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'

# Raw dataset; later cells read the `quality` column from it as the target.
df = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Every column except the target ('quality') is treated as a feature,
# kept in the same order as the columns appear in the frame.
independent_vars = [name for name in df.columns if name != 'quality']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between all columns of the frame.
corr = df.corr()

# The matrix is symmetric, so hide the upper triangle (diagonal included)
# and show each pair only once.
mask = np.triu(np.ones_like(corr, dtype = bool))

fig, ax = plt.subplots(figsize = (10,10))
sns.heatmap(corr, mask = mask, cmap = 'Greys', annot = True, ax = ax)
plt.show()

Overall, there are no independent variables that immediately stood out in correlation with wine quality. All have weak correlation with wine quality.  

* Alcohol content has a weak positive correlation with wine quality
* Volatile acidity has a weak negative correlation with wine quality

The following visualization is inspired by [this kernel](https://www.kaggle.com/swapnils007/red-wine-quality).

In [None]:
# One bar chart per feature (feature value against wine quality), laid out on a 3x4 grid.
fig, ax1 = plt.subplots(3,4, figsize=(20, 10))

independent_cols = list(independent_vars)
axes = ax1.flatten()

# Pair each feature with one grid cell. zip() stops at the shorter sequence, so a
# feature count that does not fill the grid no longer indexes past the end of
# independent_cols (presumably 11 features for this dataset vs. 12 cells -- confirm).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
for ax, feature in zip(axes, independent_cols):
    sns.barplot(x = df['quality'], y = df[feature], ax = ax)

# Hide any grid cells that did not receive a feature so no empty frame is drawn.
for ax in axes[len(independent_cols):]:
    ax.set_visible(False)

plt.show()



Weak negative correlation with quality:
* Volatile acidity
* Chlorides

Weak positive correlation with quality:
* Sulphates
* Citric Acid

# The Problem of Imbalanced Dataset

We have an imbalanced dataset in which only 14% of the data is classified as "good" or 1. 

This means that even when our classification model outputs bad or "0" all the time, we will still get 86% accuracy! Good accuracy rate, but still a pretty worthless model.

In [None]:
# Binarize the target: a quality of 7 and above becomes 1 ("good"), everything else 0 ("bad").
# NOTE: the raw scores are overwritten in place, so running this cell a second time
# would map every label to 0 (no already-binarized value reaches the threshold).
is_good = df['quality'] >= 7
df['quality'] = is_good.astype(int)

# Share of each class in the dataset -- this is where the class imbalance shows up.
df['quality'].value_counts(normalize = True)

We have a few ways of dealing with an imbalanced dataset:

# 1. Random Undersampling / Oversampling a.k.a "naive resampling"
* Random undersampling: randomly removing entries from the majority class (usually sampled without replacement).
* Random oversampling: randomly duplicating entries from the minority class, sampled with replacement.

Pros:
This is the easiest and fastest way to balance our dataset. 

Cons:
* Oversampling our data might cause our model to overfit
* Undersampling might cause some loss of information to our already small dataset

# 2. Adjusting class weights in our model
By adjusting class weights, we are penalizing our model more severely when it wrongly classifies a minority class, and more leniently when it misclassifies a majority class. The minority class will have a larger weight, and vice versa.

The formula to determine class weights is the following:

> minority class weight = total number of samples in the dataset / (total unique classes * total sample of the minority)

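The majority class weight is computed with the same formula, using the number of majority samples in the denominator, so the more frequent class ends up with the smaller weight. For example, with two classes and a 14% / 86% split, the minority weight is 1 / (2 * 0.14) ≈ 3.6 and the majority weight is 1 / (2 * 0.86) ≈ 0.58.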

To put it concretely, let's say we use a Logistic Regression model using Binary Cross-Entropy for our loss function, given by:

> Loss = −ylog(p) − (1−y)log(1−p)
 
Let's say we set our class weights for 0 as 1, and 1 as 20, so our loss function becomes:

> Loss = −(20)ylog(p) − (1)(1−y)log(1−p)

For a minority-class sample (y = 1) that our model scores with, say, p = 0.7, this gives us (using base-10 logarithms):
> Loss = −20 log(0.7) ≈ 3.10

Which is a much bigger penalty compared to when it's only −log(0.7) ≈ 0.15. A larger weight results in a larger penalty, which in turn results in a larger coefficient update. In a way, we are incentivizing our model to favor correct predictions for the minority class.  


# 3. Changing our evaluation metric
This is the simplest way, as well as the easiest method to explain.

An F-1 Score is a better evaluation metric for our model, since it takes into account Precision and Recall.
* Precision: out of all that we predict as positive, what proportion do we classify correctly?
* Recall: out of all the actual positives, what proportion do we classify correctly?

Note that there is a trade-off between precision and recall: improving one usually comes at the expense of the other. To simplify our thinking process, it's helpful to have a single score as our evaluation metric, that is, the F-1 Score.

![https://www.oreilly.com/library/view/hands-on-recommendation-systems/9781788993753/assets/dcd94ad1-96f6-4e27-84c9-d6f42e1efee2.png](https://www.oreilly.com/library/view/hands-on-recommendation-systems/9781788993753/assets/dcd94ad1-96f6-4e27-84c9-d6f42e1efee2.png)

The F-1 score is actually a harmonic mean of both precision and recall. 

# How will we evaluate our model?

We will use the F1 score to gauge our model's performance. For all our models, we will print a classification report and use the Macro F1 score as our guiding metric.

Taken from [StackExchange](https://datascience.stackexchange.com/questions/65839/macro-average-and-weighted-average-meaning-in-classification-report#:~:text=macro%2Davg%20is%20mean%20average,of%20objects%20in%20all%20classes.):

> Macro F1 calculates the F1 separated by class but not using weights for the aggregation:
>
> $F1_{class_1} + F1_{class_2} + \cdots + F1_{class_N}$
>
> which results in a bigger penalisation when your model does not perform well with the minority classes (which is exactly what you want when there is imbalance)
> 
> Weighted F1 score calculates the F1 score for each class independently but when it adds them together uses a weight that depends on the number of true labels of each class:
>
> $F1_{class_1} \cdot W_1 + F1_{class_2} \cdot W_2 + \cdots + F1_{class_N} \cdot W_N$
>
> therefore favouring the majority class (which is what you usually don't want)

# Modelling

We'll try out the following models for this classification problem:
- Logistic Regression
- Support Vector Machine
- Random Forest
- Decision Trees
- Naive Bayes

In [None]:
#preparing our train and test
from sklearn.model_selection import train_test_split

# Feature matrix (the independent_vars columns) and target vector (the quality column).
X = df[independent_vars]
y = df['quality']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
#scaling our data
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both splits so the test rows play no part in the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; only the seed is pinned.
logreg = LogisticRegression(random_state = 20).fit(X_train, y_train)

# Predicted labels for the held-out test rows (read by the report cell that follows).
y_val = logreg.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, f1_score

# Text summary comparing the true test labels with the latest predictions in y_val.
report = classification_report(y_test, y_val)
print(report)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Support vector classifier with default settings.
# SVC is imported directly so that the name `svm` is bound only to the estimator,
# instead of first naming the sklearn.svm module and then being rebound to an instance.
svm = SVC()
svm.fit(X_train, y_train)

# Predicted labels for the held-out test rows (read by the report cell that follows).
y_val = svm.predict(X_test)

In [None]:
# Text summary comparing the true test labels with the latest predictions in y_val.
report = classification_report(y_test, y_val)
print(report)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings; only the seed is pinned.
rf = RandomForestClassifier(random_state = 20)
rf.fit(X_train, y_train)

# Predict with the forest that was just fitted. The previous code called `clf.predict`,
# but `clf` is only created in the decision-tree cell further down, so a top-to-bottom
# run raised NameError (and an out-of-order run scored the wrong model).
y_val = rf.predict(X_test)

In [None]:
# Text summary comparing the true test labels with the latest predictions in y_val.
report = classification_report(y_test, y_val)
print(report)

Our best model so far! Giving us a macro average F1 score of 0.75.

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split.
gnb = GaussianNB().fit(X_train, y_train)

# Predict the held-out test rows and print the resulting report.
y_val = gnb.predict(X_test)
report = classification_report(y_test, y_val)
print(report)

# Decision Tree

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default settings; only the seed is pinned.
clf = DecisionTreeClassifier(random_state=20).fit(X_train, y_train)

# Predict the held-out test rows and print the resulting report.
y_val = clf.predict(X_test)
report = classification_report(y_test, y_val)
print(report)