In [69]:
import numpy as np
from math import sqrt
from math import pi
from math import exp

class NaiveBayes():
    """Gaussian Naive Bayes classifier for rows shaped ``[feature, ..., label]``.

    Every row is a sequence whose last element is the class label and whose
    preceding elements are numeric features.  The statistics helpers
    ``mean``, ``stdDev`` and ``calcGaussianProbability`` are module-level
    functions that are looked up when the methods run.

    Attributes:
        summary: per-class summaries built from ``train``, or ``None`` when
            no training data was passed to the constructor.
        predictions: list of predicted labels for the rows of ``test``, or
            ``None`` when ``train`` or ``test`` was not passed.
    """

    def __init__(self, train=None, test=None):
        """Optionally fit on ``train`` and classify every row of ``test``.

        Both arguments are optional so that ``NaiveBayes()`` produces an
        empty instance whose helper methods can be called directly.
        ``__init__`` has to return ``None``, so the results are stored on
        the instance instead of being returned.

        Args:
            train: iterable of rows (features followed by the label), or None.
            test: iterable of rows to classify, or None.
        """
        self.summary = None
        self.predictions = None
        if train is not None:
            self.summary = self.summarizeByClass(train)
            if test is not None:
                self.predictions = [self.predict(self.summary, row) for row in test]

    def predict(self, summary, row):
        """Return the label whose probability for ``row`` is highest.

        Ties go to the class that appears first in ``summary``; ``None`` is
        returned when ``summary`` holds no classes.
        """
        probabilities = self.calcClassProbability(summary, row)
        return max(probabilities, key=probabilities.get, default=None)

    def classSeparate(self, dataset):
        """Group the rows of ``dataset`` by their last element (the label).

        Returns:
            dict mapping each label to the list of rows carrying it, in
            their original order.
        """
        separated = dict()
        for vector in dataset:
            separated.setdefault(vector[-1], []).append(vector)
        return separated

    def summarizeDataset(self, dataset):
        """Return ``(mean, stdDev, count)`` for every feature column.

        The last column holds the class labels and is skipped before any
        statistics are computed, so labels do not have to be numeric.  An
        empty ``dataset`` yields an empty list.
        """
        columns = list(zip(*dataset))
        return [(mean(col), stdDev(col), len(col)) for col in columns[:-1]]

    def summarizeByClass(self, dataset):
        """Return a dict mapping each label to ``summarizeDataset`` of its rows."""
        separated = self.classSeparate(dataset)
        return {
            classValue: self.summarizeDataset(rows)
            for classValue, rows in separated.items()
        }

    def calcClassProbability(self, summaries, row):
        """Return the unnormalised probability of ``row`` under every class.

        For each class the prior (its share of all rows) is multiplied by
        the Gaussian density of each feature of ``row``.

        Args:
            summaries: dict as produced by ``summarizeByClass``.
            row: sequence holding at least as many values as there are
                feature summaries; extra trailing values are ignored.

        Returns:
            dict mapping each label to its probability score.
        """
        # Every feature summary of a class carries that class's row count,
        # so the count stored with the first feature is enough.
        totalRows = sum(classSummaries[0][-1] for classSummaries in summaries.values())
        probabilities = dict()
        for classValue, classSummaries in summaries.items():
            probability = classSummaries[0][-1] / float(totalRows)
            # Local names differ from the module-level ``mean`` helper on purpose.
            for i, (mu, sigma, _count) in enumerate(classSummaries):
                probability *= calcGaussianProbability(row[i], mu, sigma)
            probabilities[classValue] = probability
        return probabilities
    
def mean(data):
    """Return the arithmetic mean of ``data`` as a float.

    ``data`` must support both ``sum`` and ``len``; an empty ``data``
    raises ``ZeroDivisionError``.
    """
    total = sum(data)
    count = float(len(data))
    return total / count

def stdDev(data):
    """Return the sample standard deviation of ``data`` (``n - 1`` denominator).

    A single-element ``data`` raises ``ZeroDivisionError`` because the
    denominator is ``len(data) - 1``.
    """
    centre = mean(data)
    squaredDeviations = [(value - centre) ** 2 for value in data]
    degreesOfFreedom = float(len(data) - 1)
    return sqrt(sum(squaredDeviations) / degreesOfFreedom)

def calcGaussianProbability(x, mean, stddev):
    """Return the Gaussian probability density at ``x``.

    Args:
        x: point at which the density is evaluated.
        mean: mean of the distribution.
        stddev: standard deviation of the distribution; a value of zero
            raises ``ZeroDivisionError``.
    """
    normaliser = 1 / (sqrt(2*pi) * stddev)
    exponent = -((x-mean)**2 / (2 * stddev**2))
    return normaliser * exp(exponent)

In [64]:
# Create a classifier instance so its helper methods can be tried one at a time in the cells below.
test = NaiveBayes()

In [5]:
# Toy dataset: every row is [feature_1, feature_2, class_label].
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]

In [43]:
# Group the rows by class label (last element of each row) and display the mapping.
test_sep = test.classSeparate(dataset)
test_sep

{0: [[3.393533211, 2.331273381, 0],
  [3.110073483, 1.781539638, 0],
  [1.343808831, 3.368360954, 0],
  [3.582294042, 4.67917911, 0],
  [2.280362439, 2.866990263, 0]],
 1: [[7.423436942, 4.696522875, 1],
  [5.745051997, 3.533989803, 1],
  [9.172168622, 2.511101045, 1],
  [7.792783481, 3.424088941, 1],
  [7.939820817, 0.791637231, 1]]}

In [32]:
# Summarise each feature column over the whole dataset, i.e. with both classes pooled together.
test_summary = test.summarizeDataset(dataset)
test_summary

[(5.178333386499999, 2.7665845055177263, 10),
 (2.9984683241, 1.218556343617447, 10)]

In [56]:
# Rebinds test_summary to the per-class summaries: a dict keyed by class label.
test_summary = test.summarizeByClass(dataset)
# Row count stored with the first feature summary of class 0.
test_summary[0][0][-1]

5

In [50]:
# Sanity-check the Gaussian density at the mean and at one standard deviation on either side of it.
# The helper defined in this notebook is named calcGaussianProbability.
print(calcGaussianProbability(1.0,1.0,1.0))
print(calcGaussianProbability(2.0,1.0,1.0))
print(calcGaussianProbability(0.0,1.0,1.0))

0.3989422804014327
0.24197072451914337
0.24197072451914337


In [66]:
# Score the first row against every class; test_summary must be the per-class dict from summarizeByClass.
probabilities = test.calcClassProbability(test_summary, dataset[0])
probabilities

{0: 0.05032427673372075, 1: 0.00011557718379945765}