In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file under the Kaggle input mount,
# then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing Required Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import norm, boxcox
from scipy import stats
from imblearn.over_sampling import SMOTE


**Loading the red wine quality dataset**

In [None]:
# Read the red-wine quality CSV from the Kaggle input directory and preview the first rows.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(csv_path)
df.head()

# **Exploring the dataset**

**Checking No. of rows and columns**

In [None]:
# Print a heading followed by the (rows, columns) tuple on the next line.
print('Shape of the dataset', df.shape, sep='\n')


**Checking for null values**

In [None]:
# Count missing values per column and print them under a heading.
null_counts = df.isna().sum()
print('Checking for the null values', null_counts, sep='\n')

There are no null values in the dataset.

**Checking the data type of each feature, the number of rows, and the non-null counts**

In [None]:
# Print the frame summary: column names, non-null counts and dtypes.
df.info()

**Checking the row count, mean, standard deviation, and five-number summary of each column**

In [None]:
# Descriptive statistics per numeric column (count, mean, std, min, quartiles, max).
df.describe()

**Quality count**

In [None]:
# Bar chart of how many wines received each quality score.
sns.countplot(data=df, x='quality')

In [None]:
from collections import Counter
# Tally the number of rows for each distinct quality score.
Counter(df['quality'])

**checking the relation between the alcohol and Quality**

In [None]:
# Distribution of alcohol content within each quality score.
sns.boxplot(data=df, x="quality", y="alcohol")

We can see that wines with a higher alcohol content tend to receive higher quality scores.

**checking the relation between the residual sugar and Quality**

In [None]:
# Open a wide figure (handle kept in `plot`) and draw residual sugar per quality score.
plot = plt.figure(figsize=(15, 7))
sns.boxplot(data=df, x="quality", y="residual sugar")

We can see that there is no clear correlation between residual sugar and quality.
So, we will drop this column, as it makes no contribution to determining the quality.

**Instead of checking one by one we can use correlation matrix and heat map to analyse the correlation between each feature at once**

**checking the relation between the volatile acidity and Quality**

In [None]:
# Distribution of volatile acidity within each quality score.
sns.boxplot(data=df, x="quality", y="volatile acidity")

**checking the relation between the pH acidity and Quality**

In [None]:
# Open a wide figure (handle kept in `plot`) and draw the pH distribution per quality score.
plot = plt.figure(figsize=(15, 7))
sns.boxplot(data=df, x="quality", y="pH")

In [None]:
# Same pH-vs-quality comparison, this time as a bar plot on a fresh wide figure.
plot = plt.figure(figsize=(15, 7))
sns.barplot(data=df, x="quality", y="pH")

In [None]:
# Pairwise correlation of every column, drawn as an annotated square heatmap.
correlation = df.corr()
plt.figure(figsize=(14, 14))
heatmap_options = dict(
    cbar=True,
    square=True,
    fmt='.2f',
    annot=True,
    annot_kws={'size': 15},
    cmap='coolwarm',
)
sns.heatmap(correlation, **heatmap_options)

**Treating outliers**

In [None]:
# Columns screened for outliers.
# NOTE(review): 'total sulfur dioxide' is not in this list — confirm that is intentional.
cols = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'] # one or more

# Per-column quartiles and inter-quartile range.
Q1 = df[cols].quantile(0.25)
Q3 = df[cols].quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if ANY screened column lies more than 1.5 * IQR outside its quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier_row = ((df[cols] < lower_fence) | (df[cols] > upper_fence)).any(axis=1)

# .copy() detaches the filtered rows from the original frame so that the column
# assignments in later cells (Box-Cox, 'Reviews') operate on an independent
# DataFrame instead of a slice (avoids SettingWithCopyWarning).
# Note: re-running this cell filters the already-filtered frame again.
df = df[~is_outlier_row].copy()

**Checking for skewness**

1. fixed acidity

In [None]:
# Two side-by-side diagnostics for "fixed acidity":
#   left  - distribution plot with a fitted normal curve (fit=norm)
#   right - probability plot from scipy.stats.probplot
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["fixed acidity"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("fixed acidity Distplot", color="darkred")
stats.probplot(df["fixed acidity"], plot=prob_ax)
plt.show()

Fixing skewness using boxcox

In [None]:
# Box-Cox transform of "fixed acidity": boxcox returns the transformed values and
# the fitted lambda; the column is overwritten with the transformed values.
fixed_acidity_transformed, lam_fixed_acidity = boxcox(df["fixed acidity"])
df["fixed acidity"] = fixed_acidity_transformed

In [None]:
# Re-draw the same two diagnostics for "fixed acidity" (now holding the
# Box-Cox-transformed values): distribution with fitted normal, and probability plot.
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["fixed acidity"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("fixed acidity Distplot", color="darkred")
stats.probplot(df["fixed acidity"], plot=prob_ax)
plt.show()

2. residual sugar

In [None]:
# Two side-by-side diagnostics for "residual sugar":
#   left  - distribution plot with a fitted normal curve (fit=norm)
#   right - probability plot from scipy.stats.probplot
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["residual sugar"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("residual sugar Distplot", color="darkred")
stats.probplot(df["residual sugar"], plot=prob_ax)
plt.show()

Fixing skewness using boxcox

In [None]:
# Box-Cox transform of "residual sugar"; the column is overwritten with the
# transformed values. The fitted lambda gets its own name so it no longer
# clobbers `lam_fixed_acidity` (needed if the transform is ever inverted).
df["residual sugar"], lam_residual_sugar = boxcox(df["residual sugar"])

In [None]:
# Re-draw the same two diagnostics for "residual sugar" (now holding the
# Box-Cox-transformed values): distribution with fitted normal, and probability plot.
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["residual sugar"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("residual sugar Distplot", color="darkred")
stats.probplot(df["residual sugar"], plot=prob_ax)
plt.show()

3. free sulfur dioxide

In [None]:
# Two side-by-side diagnostics for "free sulfur dioxide":
#   left  - distribution plot with a fitted normal curve (fit=norm)
#   right - probability plot from scipy.stats.probplot
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["free sulfur dioxide"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("free sulfur dioxide Distplot", color="darkred")
stats.probplot(df["free sulfur dioxide"], plot=prob_ax)
plt.show()

In [None]:
# Box-Cox transform of "free sulfur dioxide"; the column is overwritten with the
# transformed values. The fitted lambda gets its own name so it no longer
# clobbers `lam_fixed_acidity` (needed if the transform is ever inverted).
df["free sulfur dioxide"], lam_free_sulfur_dioxide = boxcox(df["free sulfur dioxide"])

In [None]:
# Re-draw the same two diagnostics for "free sulfur dioxide" (now holding the
# Box-Cox-transformed values): distribution with fitted normal, and probability plot.
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["free sulfur dioxide"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("free sulfur dioxide Distplot", color="darkred")
stats.probplot(df["free sulfur dioxide"], plot=prob_ax)
plt.show()

4. total sulfur dioxide

In [None]:
# Two side-by-side diagnostics for "total sulfur dioxide":
#   left  - distribution plot with a fitted normal curve (fit=norm)
#   right - probability plot from scipy.stats.probplot
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["total sulfur dioxide"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("total sulfur dioxide Distplot", color="darkred")
stats.probplot(df["total sulfur dioxide"], plot=prob_ax)
plt.show()

In [None]:
# Box-Cox transform of "total sulfur dioxide"; the column is overwritten with the
# transformed values. The fitted lambda gets its own name so it no longer
# clobbers `lam_fixed_acidity` (needed if the transform is ever inverted).
df["total sulfur dioxide"], lam_total_sulfur_dioxide = boxcox(df["total sulfur dioxide"])

In [None]:
# Re-draw the same two diagnostics for "total sulfur dioxide" (now holding the
# Box-Cox-transformed values): distribution with fitted normal, and probability plot.
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["total sulfur dioxide"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("total sulfur dioxide Distplot", color="darkred")
stats.probplot(df["total sulfur dioxide"], plot=prob_ax)
plt.show()

5. alcohol

In [None]:
# Two side-by-side diagnostics for "alcohol":
#   left  - distribution plot with a fitted normal curve (fit=norm)
#   right - probability plot from scipy.stats.probplot
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["alcohol"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("alcohol Distplot", color="darkred")
stats.probplot(df["alcohol"], plot=prob_ax)
plt.show()

In [None]:
# Box-Cox transform of "alcohol"; the column is overwritten with the
# transformed values. The fitted lambda gets its own name so it no longer
# clobbers `lam_fixed_acidity` (needed if the transform is ever inverted).
df["alcohol"], lam_alcohol = boxcox(df["alcohol"])

In [None]:
# Re-draw the same two diagnostics for "alcohol" (now holding the
# Box-Cox-transformed values): distribution with fitted normal, and probability plot.
fig, (dist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df["alcohol"], fit=norm, color="orange", ax=dist_ax)
dist_ax.set_title("alcohol Distplot", color="darkred")
stats.probplot(df["alcohol"], plot=prob_ax)
plt.show()

Preprocessing the data

In [None]:
# Feature matrix: every column except the target.
# 'Reviews' (the bucketed target added in a later cell) is dropped as well so that
# re-running this cell after that one cannot leak the label into the features;
# errors='ignore' keeps the first top-to-bottom run working while the column
# does not exist yet.
X = df.drop(columns=['quality', 'Reviews'], errors='ignore')

# **Multiclassification**

In [None]:
# Divide the quality score into three review classes (kept as string labels):
#   3-4 -> '1',  5-7 -> '2',  8 -> '3'
quality_to_review = {3: '1', 4: '1', 5: '2', 6: '2', 7: '2', 8: '3'}
review_labels = df['quality'].map(quality_to_review)

# Fail loudly on any score outside 3-8 instead of silently skipping the row,
# which would leave `reviews` shorter than the frame.
unmapped = review_labels.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected quality values with no review class: "
        f"{sorted(df.loc[unmapped, 'quality'].unique())}"
    )

# `reviews` stays a plain list of strings because later cells pass it to train_test_split.
reviews = review_labels.tolist()

# assign() returns a new frame, so this also works when `df` is a filtered slice.
df = df.assign(Reviews=reviews)

In [None]:

# Tally the number of rows in each review class ('1', '2', '3').
Counter(df['Reviews'])

# Standardisation
Standardization comes into picture when features of input data set have large differences between their ranges, or simply when they are measured in different measurement units (e.g., Pounds, Meters, Miles … etc).

These differences in the ranges of initial features causes trouble to many machine learning models. For example, for the models that are based on distance computation, if one of the features has a broad range of values, the distance will be governed by this particular feature.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; X is replaced by the scaled array.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling — consider fitting on the
# training split only.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)


**PCA for Dimension Reduction**

In [None]:
from sklearn.decomposition import PCA
# Fit a PCA with default settings on the standardised features; `pca` is used
# by the next cell to inspect explained_variance_ratio_.
# NOTE(review): like the scaler, this is fitted on all rows before the train/test split.
pca = PCA()
X_pca = pca.fit_transform(X)


In [None]:
# Plot the cumulative explained-variance ratio against the component index
# to help choose how many principal components to keep.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(10, 10))
plt.plot(cumulative_variance, 'ro-')
plt.grid()

In [None]:
# As per the graph, 8 principal components account for about 90% of the
# variation in the data, so the first 8 components are kept for prediction.
n_components_kept = 8
pca_new = PCA(n_components=n_components_kept)
X_new = pca_new.fit_transform(X)


**Splitting the data into train and test sets**

In [None]:
# Hold out 25% of the rows for testing.
#   stratify     - keeps the (heavily imbalanced) review-class proportions the same
#                  in both splits, so the rare classes appear in train and test.
#   random_state - fixed seed so the split, and the accuracy reported at the end,
#                  are reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_new, reviews, test_size=0.25, stratify=reviews, random_state=42
)

**SMOTE**
SMOTE (Synthetic Minority Over-sampling Technique) is used when the classes in a dataset are imbalanced.
Here we use it to fix the class imbalance in the training data.

In [None]:
# Oversample the training split with SMOTE (seeded with 14); the test split is
# left untouched. X_train / Y_train are replaced by the resampled versions.
sm = SMOTE(random_state=14)
resampled_features, resampled_labels = sm.fit_resample(X_train, Y_train)
X_train, Y_train = resampled_features, resampled_labels

In [None]:
import collections

# Class counts of the training labels after SMOTE resampling.
smote_class_counts = collections.Counter(Y_train)
print("change in value of count after using smote:", smote_class_counts)

**Random forest classifier**

In [None]:
# Random forest on the SMOTE-balanced training data.
# random_state fixes the bootstrap / feature sampling so the fitted model and
# the accuracy reported below are reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)

**Checking the accuracy**

In [None]:
# Predict the review class for the held-out rows and score against the true labels.
X_test_prediction = model.predict(X_test)

# accuracy_score's signature is (y_true, y_pred): true labels first.
# NOTE(review): the test set is imbalanced, so plain accuracy is dominated by the
# majority class — consider also reporting per-class metrics.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy : ', test_data_accuracy)