In [74]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from numpy import inf
import numpy as np
import pandas as pd
from math import log
import math

In [103]:
class NaiveBayesClassifier():
    def fit(self,X,y):
        self.X, self.y = X,y
        self.parameters = []
        self.classes = np.unique(y)
        
        for i,c in enumerate(self.classes):
            X_where_c = X[np.where(y == c)]
            self.parameters.append([])
            
            for column in X_where_c.T:
                parameter_col = {"mean": column.mean(), "var": column.var()}
                self.parameters[i].append(parameter_col)
                
    def _calculate_prior(self,c):
        """ Calculate the prior of class c
        (samples where class == c / total number of samples)"""
        return np.mean(self.y == c)
    
    def _likelihood(self,mean,var,feature):
        """ Gaussian likelihood of the data x given mean and var """
        eps = 1e-4 # Added in denominator to prevent division by zero
        coeff = 1.0 / math.sqrt(2.0 * math.pi * var + eps)
        exponent = math.exp(-(math.pow(feature - mean, 2))/(2 * var + eps))
        return coeff * exponent

        
    def _classify(self, features):
        """ Classification using Bayes Rule P(Y|X) = P(X|Y)*P(Y)/P(X),
            or Posterior = Likelihood * Prior / Scaling Factor

        P(Y|X) - The posterior is the probability that sample x is of class y given the
                 feature values of x being distributed according to distribution of y and the prior.
        P(X|Y) - Likelihood of data X given class distribution Y.
                 Gaussian distribution (given by _calculate_likelihood)
        P(Y)   - Prior (given by _calculate_prior)
        P(X)   - Scales the posterior to make it a proper probability distribution.
                 This term is ignored in this implementation since it doesn't affect
                 which class distribution the sample is most likely to belong to.

        Classifies the sample as the class that results in the largest P(Y|X) (posterior)
        """
        aposteriors = []
        for i,c in enumerate(self.classes):
            #posterior = self._calculate_prior(c)
            posterior = self._calculate_prior(c)
                        
            for feature_value,params in zip(features, self.parameters[i]):
                likelihood = self._likelihood(params["mean"], params["var"], feature_value)                
                posterior *= likelihood
                #posterior *= likelihood
                
            aposteriors.append(posterior)
        
        return self.classes[np.argmax(aposteriors)] 
        
        
    def _predict(self, obj):
        return [self._classify(features) for features in obj]


In [104]:
dataset_iris = load_iris()

#X = dataset_iris["data"]
#y = dataset_iris["target"]

wine_data = pd.read_csv('../datasets/wine_data.csv', sep='\t', header=0)
wine_data = wine_data.sample(frac=1)


#X = wine_data.loc[wine_data['quality'] !=3].loc[wine_data['quality'] !=9]
#X = wine_data.drop(np.where(wine_data['quality'] == 3 | wine_data['quality'] == 9))

X = wine_data.drop(wine_data[(wine_data.quality == 3) & (wine_data.quality == 9)].index)

y = X.iloc[:,-1].values
X = X.iloc[:,1:-1].values

X_normalized = (X - X.mean(axis=0))/X.std(axis=0)

X_train,X_test,y_train,y_test = train_test_split(X_normalized,y, test_size=0.5)

In [105]:
nbc = NaiveBayesClassifier()
nbc.fit(X_train,y_train)

In [106]:
y_pred = nbc._predict(X_test)

In [107]:
accuracy = sum(y_pred == y_test)/len(y_test)

In [108]:
print(accuracy)

0.45891043397968606


In [96]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine

wine = load_wine()


print(wine)

X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, test_size=0.4, random_state=91
)
n_classes = np.unique(wine.target)

model = GaussianNB()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print("Accuracy: {}".format(accuracy_score(y_pred,y_test)))

{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
 

In [None]:
print("Accuracy: {}".format(sum(y_pred == y_test)/len(y_test)))
