In [31]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn import manifold

# import decision tree model from scikit-learn
from sklearn import tree

# import metrics to use cross-validation
from sklearn import metrics

%matplotlib inline

In [6]:
'''Import dataset
The link to the dataset can be found here:
https://archive.ics.uci.edu/ml/datasets/wine+quality'''

df = pd.read_csv("winequality-red.csv", sep = ';')

In [7]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [8]:
# size of the dataset
df.shape

(1599, 12)

In [11]:
# Any empty value?
df.isnull().count()

fixed acidity           1599
volatile acidity        1599
citric acid             1599
residual sugar          1599
chlorides               1599
free sulfur dioxide     1599
total sulfur dioxide    1599
density                 1599
pH                      1599
sulphates               1599
alcohol                 1599
quality                 1599
dtype: int64

In [15]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [24]:
# quality of wine is graded from 0 to 10 but this dataset hase only 6 grades.
# so we can classify them from 0 to 5
df.quality.unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [25]:
# We need a dictionary to map 0-5 into unique values of quality grades.
quality_dic = {3:0, 4:1, 5:2, 6:3, 7:4, 8:5}

In [26]:
# Lets change quality grades according to our dictionary
df['quality'] = df.quality.map(quality_dic)

In [27]:
# Now we see that quality is graded from 0-5
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,2
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2


In [28]:
# Split dataset into test and training, 1000 rows for training and 599 for testing
# 1- shuffle dataset
df = df.sample(frac = 1).reset_index(drop=True)

# Lets check to make sure it is shuffled
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.1,0.67,0.0,2.3,0.083,18.0,27.0,0.99768,3.44,0.54,9.4,2
1,7.2,0.41,0.3,2.1,0.083,35.0,72.0,0.997,3.44,0.52,9.4,2
2,7.3,0.32,0.23,2.3,0.066,35.0,70.0,0.99588,3.43,0.62,10.1,2
3,5.4,0.42,0.27,2.0,0.092,23.0,55.0,0.99471,3.78,0.64,12.3,4
4,9.9,0.54,0.45,2.3,0.071,16.0,40.0,0.9991,3.39,0.62,9.4,2


In [29]:
df_train = df.head(1000)
df_test = df.tail(599)

We train the model using decision tree 

In [32]:
# Initialize the decision tree classifier
clf = tree.DecisionTreeClassifier(max_depth = 3)

In [33]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [48]:
x_train = df_train.iloc[:, :-1]
y_train = df_train['quality']

x_test = df_test.iloc[:, :-1]
y_test = df_test['quality']

In [58]:
# Fit the model using all 11 features of the dataset
clf.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [60]:
# Prediction on training and test datasets
train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)

In [None]:
# Accuracy of our predictions:
  