In [54]:
import scipy
import random
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import time

# Wall-clock timestamp taken before any data is loaded.
start_time = time.time()

# --- Data preprocessing ---
# Read the CSV into a DataFrame and work on its underlying array.
wineDF = pd.read_csv('winequality-red.csv', delimiter=',')
wines = wineDF.values

# Inputs are the first 11 columns; the output is the last column.
X = wines[:, :11]
Y = wines[:, -1]

# Rescale every input column into the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
np.set_printoptions(precision=3)

# Collapse wine quality into three classes:
#   -1 for quality <= 3, 0 for 4 <= quality <= 6, 1 for everything else.
# The new labels are computed from the untouched values first and then written
# back through Y[:], so (Y being a slice of `wines`) the last column of
# `wines` is relabelled in place as well.
Y[:] = np.select([Y <= 3, (Y >= 4) & (Y <= 6)], [-1, 0], default=1)
        
#Xtrain, Xtest, Ytrain, Ytest = train_test_split(rescaledX, Y, test_size=0.6, random_state=1)

#Split the dataset
def splitDataset(rescaledX, splitRatio):
    """Randomly partition the rows of a dataset into a train and a test part.

    Rows are drawn one at a time, without replacement, using the module-level
    ``random`` generator.

    Parameters:
        rescaledX: iterable of rows to split. It is copied into a new list;
            the argument itself is not modified.
        splitRatio: fraction of the rows to place in the training part.

    Returns:
        ``[trainSet, remaining]`` -- two lists of rows. ``trainSet`` holds
        ``int(len(rows) * splitRatio)`` randomly drawn rows and ``remaining``
        holds the rows that were not drawn, in their original relative order.

    Raises:
        ValueError: if ``splitRatio`` asks for more rows than are available
            (``random.randrange`` is called on an exhausted pool).
    """
    pool = list(rescaledX)
    # Size the split from the rows actually passed in rather than from an
    # unrelated module-level array, so the function works for any dataset.
    trainSize = int(len(pool) * splitRatio)
    trainSet = []
    while len(trainSet) < trainSize:
        index = random.randrange(len(pool))
        trainSet.append(pool.pop(index))
    return [trainSet, pool]

#Group the rows by class label
def separateByClass(wines):
    """Group rows by the value of their last element.

    Parameters:
        wines: sequence of rows; the last element of each row is used as the
            grouping key.

    Returns:
        dict mapping each distinct last-element value to the list of rows
        carrying it, in the order the rows were encountered.
    """
    grouped = {}
    for row in wines:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(row[-1], []).append(row)
    return grouped


#Arithmetic mean
def mean(n):
    """Return the arithmetic mean of the values in ``n``.

    The sum is divided by ``float(len(n))``; an empty ``n`` therefore raises
    ZeroDivisionError.
    """
    total = sum(n)
    count = float(len(n))
    return total / count

#Sample standard deviation
def stdev(n):
    """Return the sample standard deviation of the values in ``n``.

    The squared deviations from ``mean(n)`` are summed and divided by
    ``len(n) - 1`` before taking the square root, so a sequence with a single
    value raises ZeroDivisionError.
    """
    centre = mean(n)
    squaredDeviations = [(value - centre) ** 2 for value in n]
    variance = sum(squaredDeviations) / float(len(n) - 1)
    return math.sqrt(variance)

#Return mean and stdev of attributes without the Class
def summarize(X):
    """Compute per-attribute statistics for a collection of rows.

    Parameters:
        X: sequence of rows whose last element is the class label and whose
            other elements are numeric attributes.

    Returns:
        list of ``(mean, stdev)`` tuples, one per attribute column, in column
        order. The last (class) column is not summarized. An empty ``X``
        yields an empty list.
    """
    # zip(*X) transposes the rows into columns.
    columns = list(zip(*X))
    # Drop the class column before computing anything: its statistics were
    # never used, and skipping it keeps non-numeric class labels from
    # breaking mean()/stdev().
    return [(mean(attribute), stdev(attribute)) for attribute in columns[:-1]]
    
def summarizeByClass(wines):
    """Build the per-class attribute statistics for a dataset.

    The rows are grouped with ``separateByClass`` and each group is passed to
    ``summarize``.

    Returns:
        dict mapping each class value to the result of ``summarize`` for the
        rows of that class.
    """
    return {
        classValue: summarize(instances)
        for classValue, instances in separateByClass(wines).items()
    }
    
def calProbability(rescaledX, mean, stdev, minStdev=1e-9):
    """Evaluate the Gaussian (normal) probability density at a value.

    Parameters:
        rescaledX: the value at which to evaluate the density.
        mean: mean of the distribution.
        stdev: standard deviation of the distribution.
        minStdev: lower bound applied to ``stdev`` before it is used.

    Returns:
        The density ``1 / (sqrt(2*pi) * stdev) * exp(-(x - mean)^2 / (2 * stdev^2))``.
    """
    # A standard deviation of zero (all observed values identical) would
    # divide by zero below; flooring it yields a very narrow distribution
    # instead: a large density at the mean and 0.0 elsewhere.
    stdev = max(stdev, minStdev)
    exponent = math.exp(-(math.pow(rescaledX-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

#Score an input vector against every class
def calcClassProbability(summaries, Xinput):
    """Compute, for each class, the product of per-attribute densities.

    Parameters:
        summaries: dict mapping a class value to a list of ``(mean, stdev)``
            pairs, one per attribute.
        Xinput: the row to score; element ``i`` is evaluated against the
            ``i``-th ``(mean, stdev)`` pair of every class.

    Returns:
        dict mapping each class value to the product of ``calProbability``
        over its attributes. The product starts at 1 and no class prior is
        multiplied in.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(classSummaries):
            likelihood *= calProbability(Xinput[position], mu, sigma)
        probabilities[classValue] = likelihood
    return probabilities
            
def predict(summaries, Xinput):
    """Return the class value with the highest score for ``Xinput``.

    Scores come from ``calcClassProbability``. When several classes tie for
    the highest score, the one encountered first wins; when ``summaries`` has
    no classes the result is ``None``.
    """
    scores = calcClassProbability(summaries, Xinput)
    # max() keeps the first key among equal scores, matching a strict ">" scan.
    return max(scores, key=scores.get, default=None)

#Predict a label for every row of the testSet
def getPredictions(summaries, testSet):
    """Return the list of ``predict`` results, one per row of ``testSet``.

    The predictions are in the same order as the rows.
    """
    return [predict(summaries, row) for row in testSet]
 
#Percentage of correct predictions
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label matches its prediction.

    Parameters:
        testSet: sequence of rows; the last element of each row is the true
            label.
        predictions: sequence of predicted labels, indexed like ``testSet``.

    Returns:
        ``hits / len(testSet) * 100.0`` as a float. An empty ``testSet``
        raises ZeroDivisionError.
    """
    hits = sum(
        1 for position, row in enumerate(testSet)
        if row[-1] == predictions[position]
    )
    return (hits / float(len(testSet))) * 100.0

def nBayes(splitRatio=0.9):
    """Train and evaluate the Naive Bayes model on the module-level ``wines``.

    The rows of ``wines`` are split at random into a training and a test
    part, per-class statistics are built from the training part, every test
    row is classified, and the split sizes, accuracy and predictions are
    printed.

    Parameters:
        splitRatio: fraction of the rows used for training. Defaults to 0.9.
            NOTE(review): a ratio that leaves no test rows makes the accuracy
            computation divide by zero -- confirm callers keep it below 1.
    """
    trainingSet, testSet = splitDataset(wines, splitRatio)
    print('Split {0} rows into:'.format(len(wines)))
    print('trainSet rows = {0}'.format(len(trainingSet)))
    print('and testSet rows = {0}'.format(len(testSet)))
    summaries = summarizeByClass(trainingSet)
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))
    print('Predictions: {0}'.format(predictions))

nBayes()

Split 1599 rows into:
trainSet rows = 1439
and testSet rows = 160
Accuracy: 60.0%
Predictions: [0.0, 0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, -1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, -1.0, 1.0, 0.0, -1.0, 0.0, -1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, -1.0, 1.0, 0.0, 0.0, -1.0, 0.0, -1.0, -1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, -1.0, 0.0, 0.0, 1.0, 0.0, 0.0, -1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 1.0, 0.0, 0.0, -1.0]
