In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which files are present in the Kaggle input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the red-wine quality table from the input directory into a DataFrame.
wine_csv_path = '../input/winequality-red.csv'
data = pd.read_csv(wine_csv_path)


**Using `.head()` we display the first 5 rows of the dataset.**

**For example, to display 10 rows, use `data.head(10)`.**


In [None]:
# Preview the first five rows (five is also the default for head()).
data.head(n=5)

In [None]:
# List the column labels of the loaded table.
data.columns

*We list the unique values of `quality`, the column we will be predicting, to see which values actually occur in the dataset.*


In [None]:
# Distinct values found in the target column, in order of first appearance.
pd.unique(data['quality'])

Checking Missing values in our dataset.

https://stackoverflow.com/questions/29530232/how-to-check-if-any-value-is-nan-in-a-pandas-dataframe

In [None]:
# True if at least one cell anywhere in the table is missing.
missing_mask = data.isna()
missing_mask.to_numpy().any()


*Generates descriptive statistics that summarize the central tendency, dispersion and shape of a dataset’s distribution, excluding NaN values.*

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html

In [None]:
# Summary statistics for each numeric column.
data.describe()

*We checked which quality values exist earlier; now let's also check how many rows there are for each value.*


In [None]:
# Number of rows for each distinct quality value.
data['quality'].value_counts()

*We use the built-in pandas plotting to visualise how often each quality value occurs.*

To learn more check :

https://www.kaggle.com/learn/data-visualisation

https://pandas.pydata.org/pandas-docs/stable/visualization.html

In [None]:
# Histogram of the quality column on a wide 12x5 inch figure.
quality_scores = data['quality']
quality_scores.hist(figsize=(12, 5))

*It is useful to see the average content of different components in  Wine for analysis.*

data['quality']==3 , this is just like a condition , meaning get me the data for those whose quality value is 3.



In [None]:

# Column means restricted to the rows whose quality equals 3.
is_quality_3 = data['quality'] == 3
data.loc[is_quality_3].mean()

Let's say you want mean for not just one type of quality but for every quality.

It can be done using pandas groupby. 

For a quick guide to pandas :

https://www.kaggle.com/kashnitsky/topic-1-exploratory-data-analysis-with-pandas


In [None]:
# Column means computed separately for every quality value.
rows_by_quality = data.groupby(by=['quality'])
rows_by_quality.mean()

In [None]:
# Largest value in each column. The built-in reduction gives the same
# per-column result as apply(np.max) without routing a NumPy function
# through DataFrame.apply.
data.max()

https://seaborn.pydata.org/generated/seaborn.FacetGrid.html



In [None]:
# Scatter of fixed acidity against quality, one colour per quality value.
# FacetGrid.map forwards the named columns to plt.scatter positionally, and
# plt.scatter's third positional parameter is the marker size `s`. The extra
# "pH" column that used to be passed here was therefore rendered as (very
# small) marker areas rather than as a plotted variable, so it is dropped and
# the default marker size is used.
quality_grid = sns.FacetGrid(data, hue='quality', height=7)
quality_grid.map(plt.scatter, "quality", "fixed acidity")
quality_grid.add_legend();

In [None]:
# Kernel density estimate of the pH column.
ph_values = data['pH']
ph_values.plot.density(subplots=True, layout=(1, 2), sharex=False, figsize=(16, 6));

In [None]:
# One histogram per selected column, drawn on a shared 12x4 inch figure.
hist_columns = ['fixed acidity', 'pH', 'alcohol', 'sulphates']
data[hist_columns].hist(figsize=(12, 4));

In [None]:
# Histograms of the selected columns stacked on a single set of axes.
stacked_columns = ['fixed acidity', 'pH', 'alcohol', 'sulphates']
data[stacked_columns].plot(kind='hist', stacked=True)

In [None]:
# Box plot of the alcohol column on a square 6x6 inch figure.
# seaborn is already imported as `sns` in the first cell, so it is not
# re-imported here. The figure handle gets a real name instead of `_`,
# because assigning to `_` overrides IPython's "last output" variable.
alcohol_fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=data['alcohol'], ax=ax);

In [None]:
from pandas.plotting import radviz

# RadViz projection of the table, with points coloured by the 'quality' column.
radviz(frame=data, class_column='quality')

In [None]:
# Pairwise correlations between all columns, drawn as a colour-coded grid.
corr_matrix = data.corr()
sns.heatmap(data=corr_matrix);

In [None]:
# Distribution of alcohol within each quality value.
sns.boxplot(data=data, x='quality', y='alcohol')


In [None]:
 # Green scatter of pH against quality on a square 8x8 inch figure.
 data.plot(kind='scatter', x='quality', y='pH', color='Green', figsize=(8, 8))

In [None]:
# Citric acid content goes up as the quality of the wine goes up.
fig, citric_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=data, x='quality', y='citric acid', ax=citric_ax)

**We can observe that a better quality wine tends to have more citric acid in it.**

In [None]:
# Bar chart of pH for each quality value.
fig, ph_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=data, x='quality', y='pH', ax=ph_ax)

In [None]:
# Pairwise scatter/histogram grid for the first three columns only.
first_three_columns = data.iloc[:, :3]
sns.pairplot(first_three_columns, height=3)

*Now we come to the data preprocessing part. Our goal is to classify each wine as good, OK or bad:*

- 0 is bad, i.e. quality 3 and 4
- 1 is OK, i.e. quality 5 and 6
- 2 is good, i.e. quality 7 and 8

*First we replace the quality values according to this scheme using the `map` function.*

In [None]:
# Collapse the raw quality scores into three classes: 0 = bad, 1 = ok, 2 = good.
d = {3: 0, 4: 0, 5: 1, 6: 1, 7: 2, 8: 2}

# The column is overwritten in place, so running this cell a second time would
# look up the already-mapped 0/1/2 values in `d`, find none of them, and turn
# the whole column into NaN. Skip the mapping once the column holds only
# class labels.
if not data['quality'].isin(list(d.values())).all():
    quality_class = data['quality'].map(d)
    if quality_class.isna().any():
        # A score without an entry in `d` would become a NaN label; fail here
        # instead of handing NaN to the models further down.
        raise ValueError('quality column contains scores with no class mapping')
    data['quality'] = quality_class

Let's check again , how many of them are 1,2 and 0 in our labels now using seaborn count plot.


In [None]:
# Bar chart with the number of rows in each quality class.
fig, axes = plt.subplots(figsize=(6, 8))
sns.countplot(data=data, x='quality', ax=axes)

We now separate our features and labels.

.values just means the type of features and labels will be now ndarray. 

In [None]:
# Every column except the last one is a feature; the last column is the label.
feature_frame = data.iloc[:, :-1]
label_column = data.iloc[:, -1]
features = feature_frame.values
labels = label_column.values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and every score computed from
# it) is the same on each Restart & Run All. stratify keeps the proportion of
# each class the same in the train and test parts.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

*If you look at the dataset again you will notice that the value ranges of some columns differ considerably.
Many models tend to perform better when the features are on comparable scales.
To address this we apply feature scaling; the method used here is scikit-learn's `StandardScaler`.*

https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html
https://en.wikipedia.org/wiki/Feature_scaling

https://stackoverflow.com/questions/40758562/can-anyone-explain-me-standardscaler
    

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaling to both the training and the test features.
sc = StandardScaler()
sc.fit(features_train)
features_train = sc.transform(features_train)
features_test = sc.transform(features_test)

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
https://medium.com/machine-learning-101/chapter-5-random-forest-classifier-56dc7425c3e1
https://www.analyticsvidhya.com/blog/2016/04/complete-tutorial-tree-based-modeling-scratch-in-python/

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 200-tree random forest. random_state fixes the forest's internal randomness
# so the scores below are reproducible from run to run.
randomclassifier = RandomForestClassifier(n_estimators=200, random_state=42)
randomclassifier.fit(features_train, labels_train)

# Accuracy on the rows the forest was trained on.
score2 = randomclassifier.score(features_train, labels_train)
print('Score on Training Data')
print(score2)

# Accuracy on the held-out test rows.
score = randomclassifier.score(features_test, labels_test)
print('Score on Testing Data')
print(score)

In [None]:
# Random-forest predictions for the held-out test rows.
labels_pred1 = randomclassifier.predict(features_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
CM = confusion_matrix(labels_test, labels_pred1)
print(CM)

# The colour scale starts at 0 and is left open at the top. The previous
# vmax=5 gave every cell with more than 5 samples the same saturated colour,
# so larger counts could not be told apart by shade.
rf_cm_ax = sns.heatmap(CM, vmin=0, annot=True, fmt="d", cmap="OrRd")
# confusion_matrix puts the true labels on rows and predictions on columns.
rf_cm_ax.set(xlabel='Predicted class', ylabel='True class');

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the random-forest predictions.
rf_report = classification_report(labels_test, labels_pred1)
print(rf_report)

https://www.analyticsvidhya.com/blog/2017/09/understaing-support-vector-machine-example-code/

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
# Train a polynomial-kernel SVM on the training set
from sklearn.svm import SVC
classifier = SVC(kernel='poly').fit(features_train, labels_train)

# Predictions for the held-out test rows
labels_pred = classifier.predict(features_test)

# Model score on the training data
score3 = classifier.score(features_train, labels_train)
print('Score on Training data')
print(score3)

# Model score on the testing data
score4 = classifier.score(features_test, labels_test)
print('Score on testing data')
print(score4)

In [None]:
# Confusion matrix for the SVM predictions on the test rows.
cm = confusion_matrix(labels_test, labels_pred)
print(cm)

# The colour scale starts at 0 and is left open at the top. The previous
# vmax=5 gave every cell with more than 5 samples the same saturated colour,
# so larger counts could not be told apart by shade.
svc_cm_ax = sns.heatmap(cm, vmin=0, annot=True, fmt="d", cmap="terrain")
# confusion_matrix puts the true labels on rows and predictions on columns.
svc_cm_ax.set(xlabel='Predicted class', ylabel='True class');

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the SVM predictions.
svc_report = classification_report(labels_test, labels_pred)
print(svc_report)