In [1]:
import sklearn
from sklearn import svm
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import math
import random
import numpy as np
import pandas as pd
from pandas import read_csv
from pandas.plotting import scatter_matrix
import matplotlib
from matplotlib import pyplot

In [2]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/winequality-red.csv"
names = ['fixed', 'volatile', 'citric', 'residual-sugar', 'chloride','total-sulphur','density','pH','sulphates','alcohol','quality']
dataset = read_csv(url, names=names)

In [3]:
reviews = []
for i in dataset['quality']:
    if i>=1 and i<=3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
dataset['Reviews'] = reviews

In [4]:
dataset.columns

Index(['fixed', 'volatile', 'citric', 'residual-sugar', 'chloride',
       'total-sulphur', 'density', 'pH', 'sulphates', 'alcohol', 'quality',
       'Reviews'],
      dtype='object')

In [5]:
#shape
print(dataset.shape)
# head
print(dataset.head(20))
# descriptions
print(dataset.describe())
# class distribution
print(dataset.groupby('quality').size())

(1599, 12)
      fixed  volatile  citric  residual-sugar  chloride  total-sulphur  \
7.4   0.700      0.00     1.9           0.076      11.0           34.0   
7.8   0.880      0.00     2.6           0.098      25.0           67.0   
7.8   0.760      0.04     2.3           0.092      15.0           54.0   
11.2  0.280      0.56     1.9           0.075      17.0           60.0   
7.4   0.700      0.00     1.9           0.076      11.0           34.0   
7.4   0.660      0.00     1.8           0.075      13.0           40.0   
7.9   0.600      0.06     1.6           0.069      15.0           59.0   
7.3   0.650      0.00     1.2           0.065      15.0           21.0   
7.8   0.580      0.02     2.0           0.073       9.0           18.0   
7.5   0.500      0.36     6.1           0.071      17.0          102.0   
6.7   0.580      0.08     1.8           0.097      15.0           65.0   
7.5   0.500      0.36     6.1           0.071      17.0          102.0   
5.6   0.615      0.00     1

In [6]:
dataset['Reviews'].unique()

array(['2', '3', '1'], dtype=object)

In [7]:
x = dataset.iloc[:,:11]
y = dataset['Reviews']

In [8]:
x.head(10)

Unnamed: 0,fixed,volatile,citric,residual-sugar,chloride,total-sulphur,density,pH,sulphates,alcohol,quality
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [9]:
y.head(10)

7.4     2
7.8     2
7.8     2
11.2    2
7.4     2
7.4     2
7.9     2
7.3     2
7.8     2
7.5     2
Name: Reviews, dtype: object

In [10]:
sc = StandardScaler()
x = sc.fit_transform(x)

In [11]:
print(x)

[[ 0.96187667 -1.39147228 -0.45321841 ... -0.57920652 -0.96024611
  -0.78782264]
 [ 1.96744245 -1.39147228  0.04341614 ...  0.1289504  -0.58477711
  -0.78782264]
 [ 1.29706527 -1.18607043 -0.16942723 ... -0.04808883 -0.58477711
  -0.78782264]
 ...
 [-0.09955388 -0.72391627 -0.16942723 ...  0.54204194  0.54162988
   0.45084835]
 [ 0.65462046 -0.77526673 -0.38227061 ...  0.30598963 -0.20930812
  -0.78782264]
 [-1.21684919  1.02199944  0.75289408 ...  0.01092425  0.54162988
   0.45084835]]


In [12]:
pca = PCA()
x_pca = pca.fit_transform(x)

In [13]:
pca_new = PCA(n_components=8)
x_new = pca_new.fit_transform(x)

In [14]:
print(x_new)

[[-1.87761859  1.1924937  -1.09932272 ... -0.9924877   0.10611918
  -0.15307201]
 [-0.92435907  2.08738953  0.14544332 ...  0.48709863 -1.27230599
   0.42065965]
 [-0.92897041  1.41071244 -0.58312215 ...  0.16894209 -0.6964114
   0.32569739]
 ...
 [-0.72545087 -0.5125834   1.2099803  ... -0.58025824 -0.07422021
  -0.21241432]
 [-1.72667294  0.72634347  1.01356296 ... -0.81790088  0.66161291
   0.09719617]
 [ 0.50917641 -1.25797039  0.95124124 ... -0.00744379  0.94900514
  -0.22486104]]


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size = 0.25)

In [17]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1199, 8)
(1199,)
(400, 8)
(400,)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [19]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_predict = lr.predict(x_test)



In [21]:
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)
print(lr_conf_matrix)
print(lr_acc_score*100)

[[  0   6   0]
 [  0 392   0]
 [  0   2   0]]
98.0


In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
dt_predict = dt.predict(x_test)

In [24]:
dt_conf_matrix = confusion_matrix(y_test, dt_predict)
dt_acc_score = accuracy_score(y_test, dt_predict)
print(dt_conf_matrix)
print(dt_acc_score*100)

[[  0   6   0]
 [  0 392   0]
 [  0   0   2]]
98.5


In [25]:
nb = GaussianNB()
nb.fit(x_train,y_train)
nb_predict=nb.predict(x_test)

In [26]:
nb_conf_matrix = confusion_matrix(y_test, nb_predict)
nb_acc_score = accuracy_score(y_test, nb_predict)
print(nb_conf_matrix)
print(nb_acc_score*100)

[[  1   5   0]
 [  0 392   0]
 [  0   0   2]]
98.75


In [27]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf_predict=rf.predict(x_test)



In [28]:
rf_conf_matrix = confusion_matrix(y_test, rf_predict)
rf_acc_score = accuracy_score(y_test, rf_predict)
print(rf_conf_matrix)
print(rf_acc_score*100)

[[  0   6   0]
 [  0 392   0]
 [  0   1   1]]
98.25


In [29]:
from sklearn.svm import SVC

In [30]:
lin_svc = SVC()
lin_svc.fit(x_train, y_train)
predict=lin_svc.predict(x_test)



In [31]:
lin_svc_conf_matrix = confusion_matrix(y_test, rf_predict)
lin_svc_acc_score = accuracy_score(y_test, rf_predict)
print(lin_svc_conf_matrix)
print(lin_svc_acc_score*100)

[[  0   6   0]
 [  0 392   0]
 [  0   1   1]]
98.25


In [32]:
print(classification_report(y_test,predict))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         6
           2       0.98      1.00      0.99       392
           3       1.00      1.00      1.00         2

    accuracy                           0.98       400
   macro avg       0.66      0.67      0.66       400
weighted avg       0.97      0.98      0.98       400



  'precision', 'predicted', average, warn_for)
