**Importing libraries and data**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, plot_confusion_matrix, f1_score


import os
# List every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the red-wine quality CSV into the frame used by the rest of the notebook.
df = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

**Data Exploration**

In [None]:
# First look at the table: sample rows, missing values per column and summary statistics.
overview = (
    ("First 5 rows in df:", df.head()),
    ("Checking if there is Null:", df.isnull().sum()),
    ("df describing:", df.describe()),
)
for heading, content in overview:
    print(heading)
    print(content)
    print("\n\n")

# Number of rows that repeat an earlier row
print("Checking duplicates:")
print(df.duplicated().sum())

# Removing duplicates (the first occurrence of each repeated row is kept)
df = df.drop_duplicates(keep='first')

**EDA**

In [None]:
# Scatter-plot matrix: one panel for every pair of columns in df
pair_grid = sns.pairplot(df)
plt.show()

In [None]:
# Looking for outliers: one box per column, drawn on a single wide canvas
fig, ax = plt.subplots(figsize=(40, 20))
sns.boxplot(data=df, ax=ax)

In [None]:
# Removing outliers by Tukey's box plot method
# For more information go to https://towardsdatascience.com/detecting-and-treating-outliers-in-python-part-1-4ece5098b755
q1 = df.quantile(0.25)
q2 = df.quantile(0.75)  # 75th percentile (upper quartile), despite the name
IQR = q2 - q1

# Tukey fences: anything more than 1.5 * IQR beyond a quartile counts as an outlier
lower_fence = q1 - 1.5 * IQR
upper_fence = q2 + 1.5 * IQR

# A row is flagged when at least one of its columns lies outside the fences.
# NOTE(review): the fences are built from every column of df, which presumably includes the
# target column `quality` - confirm that dropping rows because of their target value is intended.
is_outlier = ((df < lower_fence) | (df > upper_fence)).any(axis=1)

print("Number of outliers is:")
print(len(df[is_outlier]))

# Keep only the rows with no flagged column
df = df[~is_outlier]

# Visualizing data without outliers
fig, ax = plt.subplots(figsize=(40, 20))
sns.boxplot(data=df, ax=ax)

print("New df shape:")
print(df.shape)

In [None]:
# Pairwise correlation between all columns, shown as an annotated heat map
cor = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

**Building models: binary classification of wine quality**

In [None]:
# Splitting data into X, y - features and the variable to predict.
# Both the original score and the derived binary label are excluded from the features.
# `easy_quality` is dropped explicitly (when present) so the cell stays idempotent: on a re-run
# the label column already exists in df and would otherwise leak into X.
X = df.drop(columns=['quality', 'easy_quality'], errors='ignore')

# Replacing the quality column by a column with only two classes: 1 - good wine (quality >= 7),
# 0 - bad wine. assign() returns a new frame rather than writing a column into the filtered
# frame, which can raise SettingWithCopyWarning.
df = df.assign(easy_quality=(df['quality'] >= 7).astype(int))
y = df['easy_quality']

# Splitting X, y into train and test sets (75% / 25%, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12345)


*In this dataset the classes are imbalanced, so we should look at the F1 score (for class 1 in the classification report) rather than at accuracy.*
More about these scores:
https://towardsdatascience.com/beyond-accuracy-precision-and-recall-3da06bea9f6c

In [None]:
# Naive Bayes Classifier: fit on the training split, report metrics on the test split
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Classification report for Naive Bayes Classifier:")
print(classification_report(y_test, y_pred))

# Pass an explicitly sized axes: without `ax`, plot_confusion_matrix opens its own figure and a
# preceding plt.figure(figsize=...) would only leave an empty canvas behind.
fig, ax = plt.subplots(figsize=(12, 10))
plot_confusion_matrix(clf, X_test, y_test, ax=ax)

# scikit-learn metrics take (y_true, y_pred), in that order
print("F1 Score is: " + str(f1_score(y_test, y_pred)))
print("\n\n")

In [None]:
# Choosing independent features by the wrapper method; it is described at the link below:
# https://towardsdatascience.com/feature-selection-with-pandas-e3690ad8504b

import statsmodels.api as sm

# First iteration: fit OLS on all features and look for p-values below 0.05
X_1 = sm.add_constant(X)
model = sm.OLS(y, X_1).fit()
print(model.pvalues)

# Second iteration: refit on the retained features only and check the p-values again
selected_features = ['fixed acidity', 'volatile acidity', 'free sulfur dioxide',
                     'density', 'sulphates', 'alcohol']
X_1 = sm.add_constant(df.loc[:, selected_features])
model = sm.OLS(y, X_1).fit()
print(model.pvalues)

# Re-split with the reduced feature matrix (same test_size and random_state as before).
# NOTE(review): X_1 still carries the constant column added for OLS, so the classifier below
# also receives it - confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.25, random_state=12345)

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Classification report for Naive Bayes Classifier:")
print(classification_report(y_test, y_pred))

# Pass an explicitly sized axes: without `ax`, plot_confusion_matrix opens its own figure and a
# preceding plt.figure(figsize=...) would only leave an empty canvas behind.
fig, ax = plt.subplots(figsize=(12, 10))
plot_confusion_matrix(clf, X_test, y_test, ax=ax)

# scikit-learn metrics take (y_true, y_pred), in that order
print("F1 Score is: " + str(f1_score(y_test, y_pred)))
print("\n\n")

In [None]:
# Let's try Logistic regression with weighted classes. More information by the link below:
# https://towardsdatascience.com/weighted-logistic-regression-for-imbalanced-dataset-9a5cd88e68b
import statsmodels.api as sm  # imported locally so this cell does not depend on what `sm` is bound to elsewhere in the notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Reduced feature matrix (with the constant column added by statsmodels) and its train/test split
X_1 = sm.add_constant(df.loc[:,['fixed acidity', 'volatile acidity','free sulfur dioxide', 'density', 'sulphates', 'alcohol']])
X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.25, random_state=12345)

# Candidate class-weight pairs {class 0 weight, class 1 weight} explored by the grid search
w = [{0:1000,1:100},{0:1000,1:10}, {0:1000,1:1.0}, 
     {0:500,1:1.0}, {0:400,1:1.0}, {0:300,1:1.0}, {0:200,1:1.0}, 
     {0:150,1:1.0}, {0:100,1:1.0}, {0:99,1:1.0}, {0:10,1:1.0}, 
     {0:0.01,1:1.0}, {0:0.01,1:10}, {0:0.01,1:100}, 
     {0:0.001,1:1.0}, {0:0.005,1:1.0}, {0:1.0,1:1.0}, 
     {0:1.0,1:0.1}, {0:10,1:0.1}, {0:100,1:0.1}, 
     {0:10,1:0.01}, {0:1.0,1:0.01}, {0:1.0,1:0.001}, {0:1.0,1:0.005}, 
     {0:1.0,1:10}, {0:1.0,1:99}, {0:1.0,1:100}, {0:1.0,1:150}, 
     {0:1.0,1:200}, {0:1.0,1:300},{0:1.0,1:400},{0:1.0,1:500}, 
     {0:1.0,1:1000}, {0:10,1:1000},{0:100,1:1000} ]
hyperparam_grid = {"class_weight": w }

# Cross-validated search for the class weights with the best F1; refit=True retrains the best
# configuration on the whole training split so that `grid.predict` can be used directly.
lg3 = LogisticRegression(random_state = 13)
grid = GridSearchCV(lg3, hyperparam_grid, scoring = "f1", cv = 50, n_jobs = -1, refit = True)
grid.fit(X_train,y_train)
print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')
# scikit-learn metrics take (y_true, y_pred), in that order
print("F1 Score is: " + str(f1_score(y_test, grid.predict(X_test))))
print("\n\n")

# Stand-alone model with fixed class weights.
# NOTE(review): the weights are hard-coded, presumably copied from an earlier grid-search
# result - confirm they still match `grid.best_params_`.
lg4 = LogisticRegression(random_state = 13, class_weight = {0: 100, 1: 1000}, n_jobs = -1)
lg4.fit(X_train,y_train)
print("F1 Score is: " + str(f1_score(y_test, lg4.predict(X_test))))
print("\n\n")

**OverSampling**

In [None]:
# As classification without resampling works badly, let's try RandomOverSampler (RandomUnderSampler
# is imported as the alternative). More information by the link below:
# https://towardsdatascience.com/oversampling-and-undersampling-5e2bbaf56dcf
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12345)

# Report the class counts of the training split, i.e. the same rows that are resampled below,
# so the "before" and "after" numbers are directly comparable.
print("Number of elements in classes before OverSampling:")
print(Counter(y_train))
print("\n\n")

# Oversample the training split only; the test split keeps its original class balance.
# fit_resample() fits the sampler itself, so no separate fit() on the full data is needed.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
X_train = X_resampled
y_train = y_resampled

print("Number of elements in classes after OverSampling:")
print(Counter(y_resampled))

In [None]:
# Naive Bayes Classifier, fitted on whatever X_train / y_train currently hold
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Classification report for Naive Bayes Classifier:")
print(classification_report(y_test, y_pred))

# Pass an explicitly sized axes: without `ax`, plot_confusion_matrix opens its own figure and a
# preceding plt.figure(figsize=...) would only leave an empty canvas behind.
fig, ax = plt.subplots(figsize=(12, 10))
plot_confusion_matrix(clf, X_test, y_test, ax=ax)

# scikit-learn metrics take (y_true, y_pred), in that order
print("F1 Score is: " + str(f1_score(y_test, y_pred)))
print("\n\n")

In [None]:
# Linear Regression used as a classifier: the continuous prediction is rounded to a class label.
# (No `sm` alias for sklearn.metrics is created here: it was unused and would rebind the
# `sm` name that the statsmodels cells rely on.)
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X_train, y_train)

# The regression output is unbounded, so after rounding it is clipped to the two valid labels
# (0 and 1); otherwise values such as -1 or 2 would show up as extra classes in the report.
y_pred = np.clip(np.round(reg.predict(X_test)), 0, 1).astype(int)

print("Classification report for Linear Regression:")
print(classification_report(y_test, y_pred))
print("\n\n")

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train)
# predict() already returns class labels, so no rounding is required
y_pred = rf.predict(X_test)

print("Classification report for Random Forest Classifier:")
print(classification_report(y_test, y_pred))

# Pass an explicitly sized axes: without `ax`, plot_confusion_matrix opens its own figure and a
# preceding plt.figure(figsize=...) would only leave an empty canvas behind.
fig, ax = plt.subplots(figsize=(12, 10))
plot_confusion_matrix(rf, X_test, y_test, ax=ax)
print("\n\n")

In [None]:
# KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# The report title names the model that is actually fitted in this cell
print("Classification report for KNeighbors Classifier:")
print(classification_report(y_test, y_pred))

# Pass an explicitly sized axes: without `ax`, plot_confusion_matrix opens its own figure and a
# preceding plt.figure(figsize=...) would only leave an empty canvas behind.
fig, ax = plt.subplots(figsize=(12, 10))
plot_confusion_matrix(clf, X_test, y_test, ax=ax)
print("\n\n")

# scikit-learn metrics take (y_true, y_pred), in that order
print(f1_score(y_test, y_pred))

# Let's look at how the F1 score depends on the number of neighbours.
# Separate names are used inside the loop so `clf` / `y_pred` keep describing the model
# reported above.
neighbor_counts = list(range(1, 30))
f1_scores = []
for k in neighbor_counts:
    knn = KNeighborsClassifier(n_jobs = -1, n_neighbors = k)
    knn.fit(X_train, y_train)
    f1_scores.append(f1_score(y_test, knn.predict(X_test)))

fig, ax = plt.subplots(figsize=(12, 10))
ax.plot(neighbor_counts, f1_scores)
ax.set(xlabel="Number of neighbours", ylabel="F1 score (test set)",
       title="KNeighborsClassifier: F1 score vs. number of neighbours")
plt.show()


# It looks like this classifier is bad for this task

In [None]:
# SVC with default hyper-parameters
from sklearn.svm import SVC

clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Classification report for SV Classifier:")
print(classification_report(y_test, y_pred))

# Pass an explicitly sized axes: without `ax`, plot_confusion_matrix opens its own figure and a
# preceding plt.figure(figsize=...) would only leave an empty canvas behind.
fig, ax = plt.subplots(figsize=(12, 10))
plot_confusion_matrix(clf, X_test, y_test, ax=ax)
print("\n\n")

**Conclusion**

*In this case, the best model is Naive Bayes before oversampling, with an F1 score of 0.56.*