In [1]:
import numpy as np
from scipy.stats import norm, multivariate_normal

In [2]:
def get_data(N=500, seed=None):
    """Generate a two-class toy dataset made of two Gaussian blobs.

    The first ``N // 2`` samples are standard-normal points shifted by -2 in
    every dimension and labelled 0; the remaining ``N - N // 2`` samples are
    shifted by +2 and labelled 1. A constant column of ones is appended to the
    features as a bias term.

    Parameters
    ----------
    N : int, default 500
        Total number of samples. Odd values are supported; the extra sample
        is assigned to class 1.
    seed : int or None, default None
        When given, samples are drawn from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` the global NumPy random state is used.

    Returns
    -------
    Xbias : numpy.ndarray of shape (N, 3)
        Two feature columns followed by the bias column of ones.
    Y : numpy.ndarray of shape (N,)
        Class labels (0 for the first blob, 1 for the second).
    """
    D = 2  # Dimensions
    half = N // 2

    # Use a private generator only when a seed is requested, so unseeded
    # calls keep consuming the global NumPy random stream.
    rng = np.random if seed is None else np.random.RandomState(seed)
    X = rng.randn(N, D)

    # Split the samples into two blobs. Scalar broadcasting avoids building
    # shift arrays whose row count could disagree with the slice for odd N.
    X[:half] -= 2
    X[half:] += 2

    # Targets: the second blob holds N - half samples (one more when N is odd).
    Y = np.array([0] * half + [1] * (N - half))

    # Append the bias column to the features.
    Xbias = np.concatenate((X, np.ones((N, 1))), axis=1)

    return Xbias, Y

In [3]:
class NaiveBayes():
    """Gaussian Naive Bayes classifier.

    For every class label, ``fit`` stores the per-feature mean and (smoothed)
    variance of the training samples plus the class prior. ``predict`` scores
    each sample with a diagonal-covariance Gaussian log-density plus the log
    prior and returns the label with the highest score.

    Attributes set by ``fit``
    -------------------------
    gaussians : dict
        Maps each class label to ``{'mean': ..., 'variance': ...}``.
    priors : dict
        Maps each class label to its fraction of the training samples.
    """

    def fit(self, X, Y, smoothing=10e-1):
        """Estimate per-class Gaussian parameters and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Y : array-like of shape (n_samples,)
            Class labels; any hashable, sortable values are accepted.
        smoothing : float, default 10e-1
            Constant added to every per-feature variance. Note that the
            default literal ``10e-1`` equals 1.0.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)

        self.gaussians = {}
        self.priors = {}

        # np.unique returns the labels sorted, which gives the dicts (and
        # therefore the score columns in predict) a deterministic order.
        for c in np.unique(Y):
            current_x = X[Y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'variance': current_x.var(axis=0) + smoothing,
            }

            self.priors[c] = float(len(current_x)) / len(Y)

    def score(self, X, Y):
        """Return the fraction of samples in ``X`` whose prediction equals ``Y``."""
        predictions = self.predict(X)
        return np.mean(np.asarray(Y) == predictions)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted labels, taken from the labels seen in ``fit``.
        """
        X = np.asarray(X)
        n_samples, _ = X.shape

        classes = list(self.gaussians)
        log_posterior = np.zeros((n_samples, len(classes)))

        # Columns are addressed by position, not by label value, so labels do
        # not have to be the integers 0..k-1.
        for col, c in enumerate(classes):
            g = self.gaussians[c]
            # A 1-D `cov` is interpreted by SciPy as a diagonal covariance.
            log_posterior[:, col] = (
                multivariate_normal.logpdf(X, mean=g['mean'], cov=g['variance'])
                + np.log(self.priors[c])
            )

        # Translate the winning column index back into the class label.
        return np.asarray(classes)[np.argmax(log_posterior, axis=1)]


In [4]:
# Seed NumPy's global random generator so a fresh "Restart & Run All"
# produces the same train/test samples and the same accuracies every time.
np.random.seed(42)

classifier = NaiveBayes()

# Independent draws from the same two-blob distribution for train and test.
X_train, Y_train = get_data(10000)
X_test, Y_test = get_data(2000)

classifier.fit(X_train, Y_train)

print(f'Accuracy on training set {classifier.score(X_train, Y_train)}')

preds = classifier.predict(X_test)

# Reuse the predictions computed above instead of predicting the test set a
# second time inside classifier.score.
print(f'Accuracy on test set {np.mean(Y_test == preds)}')


Accuracy on training set 0.9979
Accuracy on test set 0.997
