## Implementation of Naive Bayes

In [14]:
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [15]:
data = pd.read_csv('C:/Users/adm/Documents/Python Scripts/Dataset/winequality_clean.csv')

In [16]:
data.head()

Unnamed: 0,ID,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,W0001,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,2
1,W0002,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,,9.5,2
2,W0003,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,,10.1,2
3,W0004,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,2
4,W0005,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,2


In [17]:
data.loc[(data.quality == 2), 'quality'] = 0
data.fillna(data.mean(), inplace=True)

In [18]:
X = data.drop(['ID', 'quality'], axis=1)
y = data.quality

In [19]:
y.value_counts()

0    3258
1    1640
Name: quality, dtype: int64

In [20]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

In [21]:
print (X_train.shape)
print (X_test.shape)
print (y_train.shape)
print (y_test.shape)

(2938, 11)
(1960, 11)
(2938,)
(1960,)


Here we see that logistic regression has a slightly better accuracy.

In [28]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_test, gnb.predict(X_test))

print(classification_report(y_test, y_pred))


             precision    recall  f1-score   support

          0       0.70      0.73      0.72      1282
          1       0.73      0.70      0.71      1325

avg / total       0.72      0.72      0.72      2607



In [23]:
logReg = LogisticRegression()
logReg.fit(X_train, y_train)

from sklearn.metrics import accuracy_score 

y_pred = gnb.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score 
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X, y)
# y_pred = knn.predict(X)
# print(accuracy_score(y, y_pred))

0.6979591836734694
             precision    recall  f1-score   support

          0       0.76      0.79      0.78      1290
          1       0.56      0.52      0.54       670

avg / total       0.69      0.70      0.69      1960



In [24]:
from sklearn.utils import resample

data_majority = data[data['quality']==0]
data_minority = data[data['quality']==1]

data_minority_upsampled = resample(data_minority,
replace=True,
n_samples=3258, #same number of samples as majority classe
random_state=1) #set the seed for random resampling
# Combine resampled results
data_upsampled = pd.concat([data_majority, data_minority_upsampled])

data_upsampled['quality'].value_counts()

1    3258
0    3258
Name: quality, dtype: int64

In [25]:
X = data_upsampled.drop(['ID', 'quality'], axis=1)
y = data_upsampled.quality

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
print (X_train.shape)
print (X_test.shape)
print (y_train.shape)
print (y_test.shape)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
y_pred = gnb.predict(X_test)
print (accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

(3909, 11)
(2607, 11)
(3909,)
(2607,)
0.7161488300728807
             precision    recall  f1-score   support

          0       0.70      0.73      0.72      1282
          1       0.73      0.70      0.71      1325

avg / total       0.72      0.72      0.72      2607



In [26]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.70      0.73      0.72      1282
          1       0.73      0.70      0.71      1325

avg / total       0.72      0.72      0.72      2607

