In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
df = pd.read_csv('winequality-white.csv', sep=';')

In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [7]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [50]:
train_x = df[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'density',
            'pH', 'sulphates', 'alcohol']]
train_y = df['quality']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.33, random_state=42)

In [52]:
scaler = StandardScaler()
train_x = scaler.fit(train_x)

### 1. Logistic Regression

In [13]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

y_hat = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_hat)
print(accuracy)

mat = confusion_matrix(y_test, y_hat)
print(mat)

0.5231910946196661
[[  0   0   3   3   0   1]
 [  0   0  24  22   0   0]
 [  0   0 245 226   1   0]
 [  0   0 138 583  10   0]
 [  0   0  12 274  18   0]
 [  0   0   1  49   7   0]]


### Support Vector Machine

In [14]:
clf = svm.SVC()
clf.fit(X_train, y_train)

y_hat = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_hat)
print(accuracy)

mat = confusion_matrix(y_test, y_hat)
print(mat)

0.5299938157081014
[[  0   0   1   6   0   0]
 [  0   3  24  19   0   0]
 [  0   2 227 239   4   0]
 [  0   0 134 565  30   2]
 [  0   0  22 221  61   0]
 [  0   0   4  35  17   1]]


### Linear Regression

In [53]:
clf = LinearRegression()
clf.fit(X_train, y_train)

y_hat = clf.predict(X_test)

for i in range(len(y_hat)):
    temp = int(y_hat[i])
    if temp+0.5 >= y_hat[i]:
        y_hat[i] = int(y_hat[i])
    else:
        y_hat[i] = int(y_hat[i])+1

accuracy = accuracy_score(y_test, y_hat)
print(accuracy)

mat = confusion_matrix(y_test, y_hat)
print(mat)

0.5163883735312307
[[  0   0   3   3   1   0]
 [  0   1  23  22   0   0]
 [  0   3 203 261   5   0]
 [  0   0 113 562  56   0]
 [  0   0  17 218  69   0]
 [  0   0   0  36  21   0]]


### Random Forest

In [28]:
clf = RandomForestClassifier(max_depth=15)
clf.fit(X_train, y_train)

y_hat = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_hat)
print(accuracy)

mat = confusion_matrix(y_test, y_hat)
print(mat)

0.6246134817563389
[[  0   0   3   3   1   0]
 [  0   8  25  12   1   0]
 [  0   6 313 145   7   1]
 [  0   5 143 520  63   0]
 [  0   2  16 133 151   2]
 [  0   0   0  24  15  18]]


### K-Nearest Neighbour

In [36]:
clf = KNeighborsClassifier(n_neighbors=15)
clf.fit(X_train, y_train)

y_hat = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_hat)
print(accuracy)

mat = confusion_matrix(y_test, y_hat)
print(mat)

0.49288806431663573
[[  0   0   1   6   0   0]
 [  0   6  24  16   0   0]
 [  0   8 223 226  13   2]
 [  0   1 186 496  46   2]
 [  0   1  31 200  71   1]
 [  0   0   6  31  19   1]]


### Decision Tree

In [37]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

y_hat = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_hat)
print(accuracy)

mat = confusion_matrix(y_test, y_hat)
print(mat)

0.5875077303648732
[[  0   0   1   4   1   0   1]
 [  1  16  16   8   4   1   0]
 [  2  18 295 132  20   5   0]
 [  2  16 151 459  88  14   1]
 [  1   8  23 102 159  11   0]
 [  0   0   3  12  21  21   0]
 [  0   0   0   0   0   0   0]]


### Result Dataframe - After feature selection

In [54]:
results = pd.DataFrame({
    'Model': ['Logistic Regression','SVC', 'Linear Regression', 'Random Forest','KNN', 'Decision Tree'],
    'Score': [0.523,0.529,0.516,0.624,0.492,0.587]})

result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.624,Random Forest
0.587,Decision Tree
0.529,SVC
0.523,Logistic Regression
0.516,Linear Regression
0.492,KNN


### Results
After modifying features, the results did not change that much.
Random Forest still scored highest with 62.4%