In [None]:
# This is a Python 3 environment

import numpy as np # linear algebra
import pandas as pd # data processing

#libraries for visualizations
import matplotlib.pyplot as plt
import seaborn as sns

#libraries for prediction


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

import os
# Print the names of the entries found in the ../input directory.
print(os.listdir("../input"))



white wine dataset:

In [None]:
# Read the semicolon-separated white wine table.
ww_df = pd.read_csv('../input/winequality-white.csv', sep=';')
# Only a cell's last expression is rendered, so this preview is evaluated
# but not shown; the summary statistics below are the cell output.
ww_df.head()
ww_df.describe()

red wine dataset:

In [None]:
# Read the semicolon-separated red wine table.
rw_df = pd.read_csv('../input/winequality-red.csv', sep=';')
# Only a cell's last expression is rendered, so this preview is evaluated
# but not shown; the summary statistics below are the cell output.
rw_df.head()
rw_df.describe()

In [None]:
# (rows, columns) of the white wine table.
ww_df.shape

In [None]:
# (rows, columns) of the red wine table.
rw_df.shape

In [None]:
# Number of missing values in each white wine column.
ww_df.isna().sum()


In [None]:
# Number of missing values in each red wine column.
rw_df.isna().sum()

These datasets don't have null values.

Red Wine Dataset is much smaller than the white wine dataset.

I don't combine the two datasets because what is considered a good feature for a red wine could be a bad feature for a white wine. As a wine lover I can say that white and red are really two different things and we should never mix them, at dinner or in a data science competition :)
Let's plot a histogram of the target variable: quality.

In [None]:
# Overlay the quality histograms of both tables on a single set of axes.
fig, ax = plt.subplots()
ax.set_title('Wine Quality')
for wine_df, wine_label in ((ww_df, 'white wine'), (rw_df, 'red wine')):
    ax.hist(wine_df['quality'], alpha=0.5, label=wine_label)
ax.legend(loc='upper right')
plt.show()

They have a similar distribution of the target variable.

Let's see which variables have the highest correlation with the target one.

In [None]:
def plot_quality_correlations(wine_df, title, color, target="quality", threshold=0.20):
    """Draw a horizontal bar chart of each feature's correlation with the target.

    Parameters
    ----------
    wine_df : pd.DataFrame
        Table holding the feature columns and the ``target`` column.
    title : str
        Title of the chart.
    color : str
        Matplotlib colour used for the bars.
    target : str, default "quality"
        Column every other column is correlated against.
    threshold : float, default 0.20
        Only features whose correlation is above ``threshold`` or below
        ``-threshold`` are drawn.
    """
    # Pearson correlation of every non-target column with the target.
    target_values = wine_df[target].values
    corr = pd.Series({
        col: np.corrcoef(wine_df[col].values, target_values)[0, 1]
        for col in wine_df.columns
        if col != target
    }).sort_values()

    # Keep only the clearly positive or clearly negative correlations.
    corr = corr[(corr > threshold) | (corr < -threshold)]

    positions = np.arange(len(corr))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(positions, corr.values, color=color)
    ax.set_yticks(positions)
    ax.set_yticklabels(corr.index, rotation='horizontal')
    ax.set_xlabel("Correlation coefficient")
    ax.set_title(title)
    plt.show()


plot_quality_correlations(
    ww_df,
    "Correlation coefficient of the variables with the target variable for white wine",
    color='gold',
)
plot_quality_correlations(
    rw_df,
    "Correlation coefficient of the variables with the target variable for red wine",
    color='r',
)

As we can see, the variables that correlate the most with the target variable are different in the two datasets. White wine quality has a positive correlation with alcohol and a negative correlation with chlorides and density. Red wine quality has a positive correlation with alcohol, sulphates and citric acid but a negative correlation with volatile acidity.

In [None]:
# For each table: cap the quality grade at 7, then draw the alcohol
# distribution per grade as a violin plot.
# NOTE: the cap rewrites the `quality` column of ww_df / rw_df in place, so
# every later cell (including the models) sees the capped target.
for wine_df, plot_title in (
    (ww_df, 'Violin plot quality and alcohol for white wines'),
    (rw_df, 'Violin plot quality and alcohol for red wines'),
):
    # One .loc call with (row mask, column label) writes into the frame itself.
    # The chained form df['quality'].loc[mask] = 7 assigns through an
    # intermediate Series, which triggers SettingWithCopyWarning and does not
    # modify the frame when pandas Copy-on-Write is active.
    wine_df.loc[wine_df['quality'] > 7, 'quality'] = 7

    plt.figure(figsize=(12, 8))
    sns.violinplot(x='quality', y='alcohol', data=wine_df)
    plt.xlabel('quality', fontsize=12)
    plt.ylabel('alcohol', fontsize=12)
    plt.title(plot_title)
    plt.show()

We can see that the relationship with the variable alcohol is very similar in the two datasets

Let's compute the correlation matrix for the two datasets.

In [None]:
# Feature-to-feature Pearson correlation matrices (target column excluded),
# drawn as one heatmap per wine type.
# `columns=` is used because passing the axis positionally (drop('quality', 1))
# is rejected by pandas >= 2.0.
temp_dfww = ww_df.drop(columns='quality')
corrmatww = temp_dfww.corr(method='pearson')
f, ax = plt.subplots(figsize=(12, 12))

# Draw the heatmap using seaborn on the axes created above
sns.heatmap(corrmatww, vmax=1., square=True, cmap="YlOrRd", ax=ax)
ax.set_title("Correlation Matrix White Wine", fontsize=15)
plt.show()


temp_dfrw = rw_df.drop(columns='quality')
corrmatrw = temp_dfrw.corr(method='pearson')
f, ax = plt.subplots(figsize=(12, 12))

# Draw the heatmap using seaborn on the axes created above
sns.heatmap(corrmatrw, vmax=1., square=True, cmap="YlOrRd", ax=ax)
ax.set_title("Correlation Matrix Red Wine", fontsize=15)
plt.show()

We can see that sulphates and citric acid have a quite high correlation in Red Wines but not in White Wines. 

In [None]:
# Joint plot of total sulfur dioxide against density for white wine, with the
# most extreme 0.5% of values on each side clipped so they do not stretch the
# axes.
col = "total sulfur dioxide"
llimit, ulimit = np.percentile(ww_df[col].values, [0.5, 99.5])

# Clip a copy that is used for plotting only. The limits and the plotted data
# both belong to the white wine table, so nothing is written to rw_df, and the
# columns later used for modelling stay untouched.
ww_col_clipped = ww_df[col].clip(lower=llimit, upper=ulimit)

# jointplot creates its own figure, so no plt.figure() call is needed; labels
# and title are set through the returned grid so they land on the joint axes
# and the figure rather than on whichever axes happens to be current.
grid = sns.jointplot(x=ww_col_clipped.values, y=ww_df.density.values, height=10)
grid.set_axis_labels(col, 'density', fontsize=12)
grid.fig.suptitle("total sulfur dioxide Vs density (white wine)", fontsize=15)
grid.fig.subplots_adjust(top=0.95)  # leave room above the marginal plot for the title
plt.show()

From the graph it is clear that there isn't a linear correlation between sulphates and citric acid.

**PREDICTION FOR WHITE WINE DATASET**

To predict the target variable (quality) I divide the white wine dataset into a training set (70%) and a test set (30%). I will build the model on the training set and test its accuracy on the test set.
Since my target variable is categorical, I can try to apply a classification algorithm.

In [None]:
# Target (dependent variable) for the white wine model.
yww = ww_df['quality']

In [None]:
# Feature matrix: every column except the target. `columns=` is used because
# passing the axis positionally (drop('quality', 1)) is rejected by pandas >= 2.0.
ww_df_noy = ww_df.drop(columns='quality')

#x_col = ['alcohol', 'chlorides', 'density']
#ww_df_noy  = ww_df_noy [x_col]

# 70% train / 30% test. random_state pins the shuffle so the split (and the
# scores computed from it) are the same on every run.
(xww_train, xww_test, yww_train, yww_test) = train_test_split(
    ww_df_noy, yww, test_size=0.3, random_state=1
)

print(xww_train.shape, yww_train.shape)
print(xww_test.shape, yww_test.shape)

Linear regression and other regression-based methods didn't provide a good level of accuracy. Therefore, since my target variable is categorical, I used a tree-based technique, a RandomForestClassifier. I didn't apply cross-validation because with a random forest it is not necessary.

In [None]:
# Seed NumPy's global random generator, then fit a default random forest on
# the white wine training split.
np.random.seed(1)
my_model = RandomForestClassifier().fit(xww_train, yww_train)
my_model  # fitted estimator shown as the cell output


In [None]:

# Use the fitted model to predict the quality class of each test-set wine.
# (The variable names say "prices", but the values are predicted quality labels.)
predicted_prices = my_model.predict(xww_test)
# Cast the predictions to int — presumably already integers, since the target is `quality`; verify.
predicted_prices_int=predicted_prices.astype(int)
# Print the predicted quality labels as a quick sanity check.
print(predicted_prices_int)



In [None]:
# Share of white wine test samples whose predicted quality equals the true one.
accuracy = accuracy_score(y_true=yww_test, y_pred=predicted_prices_int)
accuracy

In [None]:
from sklearn.metrics import classification_report

# Per-class metrics for the white wine test predictions.
report = classification_report(y_true=yww_test, y_pred=predicted_prices_int)
print(report)

In conclusion, white wine quality is positively correlated with the amount of alcohol and negatively correlated with chlorides and density. Linear models are not sufficient to create a good prediction of the quality and, since the target is also a categorical variable, I used a tree-based model for it. I divided the dataset into training and test sets (cross-validation is not necessary with a random forest) and I obtained an accuracy of almost 70%. I used all the variables because the dataset is very small. I checked, and selecting only the most relevant variables didn't increase the accuracy.

**RED WINE PREDICTION**

In [None]:
# Target (dependent variable) and feature matrix for the red wine model.
yrw = rw_df.quality
# `columns=` is used because passing the axis positionally
# (drop('quality', 1)) is rejected by pandas >= 2.0.
rw_df_noy = rw_df.drop(columns='quality')

# 70% train / 30% test. random_state pins the shuffle so the split does not
# depend on whatever state earlier cells left in the global random generator.
(xrw_train, xrw_test, yrw_train, yrw_test) = train_test_split(
    rw_df_noy, yrw, test_size=0.3, random_state=1
)

print(xrw_train.shape, yrw_train.shape)
print(xrw_test.shape, yrw_test.shape)

In [None]:
# Fit a random forest on the red wine training split. This rebinds `my_model`,
# so the white wine model is no longer reachable under that name.
# An explicit random_state makes the fit repeatable regardless of which cells
# ran before this one.
my_model = RandomForestClassifier(random_state=1)
my_model.fit(xrw_train, yrw_train)

In [None]:
# Predict the quality class of each red wine in the test set.
# (The variable names say "prices", but the values are predicted quality labels.)
predicted_prices = my_model.predict(xrw_test)
# Cast the predictions to int — presumably already integers, since the target is `quality`; verify.
predicted_prices_int=predicted_prices.astype(int)


In [None]:
# Share of red wine test samples whose predicted quality equals the true one.
accuracy = accuracy_score(y_true=yrw_test, y_pred=predicted_prices_int)
accuracy

For the red wine dataset I used the same approach and I obtained an accuracy of almost 70%.