#### Import Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Read dataset

In [None]:
# Load the wine dataset from a CSV file located in the working directory.
df = pd.read_csv('WineQuality.csv')
# Preview the first rows to confirm the file parsed as expected.
df.head()

#### Sorting out values according to 1st column

In [None]:
# Display the rows ordered by the leading "Unnamed: 0" column.
# The result is only shown, not assigned, so `df` itself is unchanged.
df.sort_values(by="Unnamed: 0")

It seems that the 1st column is a kind of batch identifier describing how the data was collected. Since the 1st column doesn't really affect the dataset, the whole column can be dropped.

In [None]:
# Remove the leftover "Unnamed: 0" column.
# errors="ignore" keeps this cell idempotent: because `df` is reassigned,
# re-running the cell would otherwise raise KeyError once the column is gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df.head()

#### Checking for any null values in the dataset


In [None]:
# Number of missing entries in each column
df.isna().sum()

Checking the data type of each column (see the `df.info()` output below).

#### Checking for statistical values of the dataset

In [None]:
# Summary statistics for the dataset's columns
df.describe()

In [None]:
# Column names, non-null counts and dtypes
df.info()

#### Wine type distribution

#### There is only 1 column that has categorical values, which is the "Type" column. This column will be converted to numerical values using the LabelEncoder

In [None]:
# Label-encode the categorical 'Type' column.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# The encoded integers overwrite the original 'Type' values in place;
# `le` is kept so the fitted encoder stays available afterwards.
df['Type'] = le.fit_transform(df['Type']) 


# Preview the frame with the encoded column.
df.head()


There are only 2 wine types, and their non-numerical labels are converted into the numerical values 0 and 1: 0 for red and 1 for white. Now that the wine type is encoded as 0 and 1, this can be treated as a binary classification problem.

#### Data and targets are separated into 2 different variables

In [None]:
# Labels are the encoded wine type; features are every remaining column.
target = df['Type']
data = df.drop(columns=['Type'])

It is good to check for class imbalances in the dataset because they might affect the performance of the model. If there is a class imbalance, the model will be biased towards the majority class.

In [None]:
# Count how many rows belong to each encoded class (0 and 1).
count_0 = int((df['Type'] == 0).sum())
count_1 = int((df['Type'] == 1).sum())

# Imbalance ratio: size of the larger class divided by the smaller one.
ratio = max(count_0, count_1) / min(count_0, count_1)

ratio

In [None]:
# Bar chart of the number of rows per encoded wine type
sns.catplot(x='Type', data=df, kind='count')

In this case, there is a class imbalance. The white wine class is the majority class and the red wine class is the minority class. This can be seen in the bar graph above.

#### SMOTE Implementation for class imbalance

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample only the minority class.
# random_state is fixed (same seed as the train/test split) because SMOTE
# generates synthetic rows at random; without it the resampled data, and every
# metric computed downstream, would change on each run.
smote = SMOTE(sampling_strategy='minority', random_state=90)
sampled_data, sampled_targets = smote.fit_resample(data, target)

In [None]:
# Class counts after resampling.
# `sampled_targets` is a Series, and seaborn cannot look up the column name
# 'Type' inside a Series passed as `data`; convert it to a one-column
# DataFrame so that x='Type' resolves.
sns.catplot(x='Type', data=sampled_targets.to_frame(name='Type'), kind='count')

#### Scaling the data using MinMax Scaler

In [None]:
# Normalize the resampled features with MinMaxScaler.
# NOTE(review): the scaler is fit on the full resampled set, before the
# train/test split performed later, so rows that end up in the test set
# also influence the fitted scaling — consider fitting on the training split only.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(sampled_data)


In [None]:
# Split the dataset into train (80%) and test (20%) sets; no separate
# validation set is created here.
from sklearn.model_selection import train_test_split

# Split the MinMax-scaled features. The previous call passed `sampled_data`,
# which left `scaled_data` unused and fed unscaled features to the models.
# The scaled array is wrapped back into a DataFrame so that x_train / x_test
# keep the same type and column names they had before.
scaled_features = pd.DataFrame(scaled_data, columns=sampled_data.columns, index=sampled_data.index)

x_train, x_test, y_train, y_test = train_test_split(scaled_features, sampled_targets, test_size=0.2, random_state=90)

As stated above, now that this can be taken as a binary classification problem, logistic regression can be used to train the model.

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# max_iter is raised to 1000 to give the lbfgs solver more iterations.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)
log_reg.fit(x_train, y_train)

y_pred = log_reg.predict(x_test)

# accuracy score — previously the predictions were computed but never
# evaluated, so this cell produced no result.
accuracy_score(y_test, y_pred)




## Using Decision Tree Classifier


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# random_state is fixed (same seed as the train/test split) so the fitted tree
# and its scores are reproducible across runs.
dt = DecisionTreeClassifier(random_state=90)
dt.fit(x_train, y_train)

# A bare `y_pred_dt` expression used to follow this line; it displayed nothing
# because only a cell's last expression is rendered, so it was removed.
y_pred_dt = dt.predict(x_test)

#accuracy score
accuracy_score(y_test, y_pred_dt)

In [None]:
# Confusion matrix of the decision tree predictions on the test split
from sklearn.metrics import confusion_matrix
conf_mat_dt = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)
conf_mat_dt


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Render the decision tree confusion matrix with readable class labels.
disp_dt = ConfusionMatrixDisplay(
    display_labels=['Red Wine', 'White Wine'],
    confusion_matrix=conf_mat_dt,
)
disp_dt.plot()


In [None]:
# Classification report for the decision tree predictions on the test split
from sklearn.metrics import classification_report
# classification_report returns text, hence the print() call.
print(classification_report(y_test, y_pred_dt))


## Using Random Forest

In [None]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed (same seed as the train/test split): the forest is
# built with randomness, so without a seed the accuracy and confusion matrix
# would differ on every run.
rf = RandomForestClassifier(random_state=90)
rf.fit(x_train, y_train)

y_pred_rf = rf.predict(x_test)

#accuracy score
accuracy_score(y_test, y_pred_rf)



#### Confusion Matrix for random forest classifier


In [None]:
# Confusion matrix of the random forest predictions on the test split
conf_mat_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
conf_mat_rf


In [None]:
# Render the random forest confusion matrix with readable class labels.
disp_rf = ConfusionMatrixDisplay(
    display_labels=['Red Wine', 'White Wine'],
    confusion_matrix=conf_mat_rf,
)
disp_rf.plot()


In [None]:
# Classification report for the random forest predictions on the test split
print(classification_report(y_test, y_pred_rf))


## Using KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with default settings, trained on the training split
knn = KNeighborsClassifier().fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)

# accuracy on the test split
accuracy_score(y_true=y_test, y_pred=y_pred_knn)


In [None]:
# Confusion matrix of the KNN predictions on the test split
conf_mat_knn = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
conf_mat_knn

In [None]:
# Render the KNN confusion matrix with readable class labels.
disp_knn = ConfusionMatrixDisplay(
    display_labels=['Red Wine', 'White Wine'],
    confusion_matrix=conf_mat_knn,
)
disp_knn.plot()
