# [Gaussian Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Gaussian_naive_Bayes)

* https://github.com/DTU-CS101/ML-TUTORIAL

In [None]:
import csv
import random
import math

In [None]:
def loadCSV(filename):
    '''
    function to load a CSV file that contains only numeric fields.

    filename: path of the CSV file to read.
    Returns a list of rows, each row being a list of floats.
    Blank lines in the file are skipped, so they never show up as
    empty rows in the result.
    Raises ValueError when a field cannot be converted to float.
    '''
    # newline="" leaves newline handling to the csv module, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(filename, "r", newline="") as csvfile:
        # csv.reader yields [] for a blank line; `if row` drops those.
        return [[float(x) for x in row] for row in csv.reader(csvfile) if row]

In [None]:
# Load the data file; loadCSV returns every row as a list of floats.
dataset = loadCSV('data/pima-indians-diabetes.data')

In [None]:
# Number of rows loaded from the file.
len(dataset)

In [None]:
def splitDataset(dataset, split_ratio):
    '''
    function to randomly partition the dataset into a training part
    and a test part.
    split_ratio is the fraction of rows that goes into the training
    part (truncated to a whole number of rows); all remaining rows
    form the test part. The passed dataset itself is not modified.
    Returns the pair (trainset, testset).
    '''
    target = int(len(dataset) * split_ratio)
    remaining = list(dataset)
    picked = []

    # move one randomly chosen row per iteration from remaining to picked
    for _ in range(target):
        position = random.randrange(len(remaining))
        picked.append(remaining.pop(position))
    return picked, remaining

In [None]:
# Randomly assign 67% of the rows to the training set; the rest form the test set.
trainset, testset = splitDataset(dataset, 0.67)

In [None]:
# Row counts of the training and test partitions.
len(trainset), len(testset)

In [None]:
def separateByClass(dataset):
    '''
    function to group the rows of the passed dataset by class value.
    The class value is the last element of a row. The result is a
    dictionary that maps each class value found to the list of rows
    carrying it, kept in their original order.
    '''
    groups = {}
    for row in dataset:
        # row[-1] is the class value; start a new list on first sight
        groups.setdefault(row[-1], []).append(row)
    return groups

In [None]:
# Largest class value (last element of a row) that occurs in the test set.
max(row[-1] for row in testset)

In [None]:
# Group every row of the full dataset by its class value (last element).
separated = separateByClass(dataset)

In [None]:
# Transpose the dataset: one tuple per column instead of one list per row.
attributes = list(zip(*dataset))

In [None]:
# Width of the first row next to the number of transposed columns.
len(dataset[0]), len(attributes)

In [None]:
# Number of rows next to the length of one transposed column.
len(dataset), len(attributes[1])

In [None]:
def mean(numbers):
    '''
    function to compute the arithmetic mean of the passed numbers.
    The result is always a float.
    '''
    total = sum(numbers)
    count = len(numbers)
    return float(total) / count

In [None]:
# Mean of the first column.
mean(attributes[0])

In [None]:
def stddev(numbers):
    '''
    function to compute the 'sample standard deviation' of the passed
    numbers, i.e. the squared deviations from the mean are divided by
    (count - 1) before the square root is taken
    '''
    centre = mean(numbers)
    squared_deviations = ((value - centre) ** 2 for value in numbers)
    variance = float(sum(squared_deviations)) / (len(numbers) - 1)
    return math.sqrt(variance)

In [None]:
# Sample standard deviation of the first column.
stddev(attributes[0])

In [None]:
def summarize(dataset):
    '''
    function to compute per-column statistics of the dataset.
    Returns a list with one (mean, stddev) tuple per column, in
    column order, leaving out the last column (the class value).
    '''
    stats = []
    # zip(*dataset) walks the data column by column
    for column in zip(*dataset):
        stats.append((mean(column), stddev(column)))
    # the final tuple describes the class column, which is not an attribute
    del stats[-1]
    return stats

In [None]:
def summarizeByClass(dataset):
    '''
    function to build the summaries dictionary: each class value found
    in the dataset is mapped to the attribute summaries computed from
    the rows of that class only.
    '''
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

In [None]:
# Fit the per-class (mean, stddev) summaries on the training rows only.
# Summarizing the full dataset would include the test rows the model is
# scored on afterwards, which inflates the reported accuracy.
summaries = summarizeByClass(trainset)

In [None]:
# Number of classes, and number of (mean, stddev) pairs stored under key 0.
len(summaries), len(summaries[0])

In [None]:
def calculateProbability(x, mean, stdev):
    '''
    function to evaluate the gaussian probability density function
    with the given mean and standard deviation at the point x
    '''
    deviation = x - mean
    spread = 2 * stdev ** 2
    exponent = math.exp(-(math.pow(deviation, 2) / spread))
    normalizer = math.sqrt(math.pi * 2) * stdev
    return 1.0 / normalizer * exponent

In [None]:
# Gaussian density of the value 3.5 under the first (mean, stddev) pair stored for class 0.
calculateProbability(3.5, summaries[0][0][0], summaries[0][0][1])

In [None]:
def classProb(summaries, inputVec):
    '''
    function to score the input vector against every class.
    For each class value in summaries, the gaussian densities of the
    input attributes (under that class's mean and stddev) are
    multiplied together. Returns a dictionary that maps each class
    value to the resulting product.
    '''
    scores = {}
    for label, attributeStats in summaries.items():
        product = 1
        for position, (mu, sigma) in enumerate(attributeStats):
            # attributes are treated as independent, so densities multiply
            product *= calculateProbability(inputVec[position], mu, sigma)
        scores[label] = product
    return scores

In [None]:
# Class values that have summaries, in dictionary order.
list(summaries)

In [None]:
# Per-class product of attribute densities for the first test row.
classProb(summaries, testset[0])

In [None]:
def predict(summaries, inputVec):
    '''
    function to predict the class value of the passed input vector.
    The class with the largest value returned by classProb wins; on a
    tie the class seen first is kept. Returns None when summaries
    holds no class at all.
    '''
    winner, winningScore = None, -1

    for label, score in classProb(summaries, inputVec).items():
        # skip any class that does not beat the current winner
        if winner is not None and not score > winningScore:
            continue
        winner, winningScore = label, score
    return winner

In [None]:
# [i for i, val in enumerate(testset) if val[-1] == 1.0]

In [None]:
# Predicted class value for the test row at index 5.
predict(summaries, testset[5])

In [None]:
def getPredictions(summaries, testset):
    '''
    function to predict a class value for every row (instance) of the
    passed testset. Returns the list of predictions in row order.
    '''
    return [predict(summaries, row) for row in testset]

In [None]:
# Predict a class value for every row of the test set.
predictions = getPredictions(summaries, testset)

In [None]:
# Number of predictions produced.
len(predictions)

In [None]:
def getAccuracy(testset, predictions):
    '''
    function to compute the accuracy as a percentage: the number of
    rows whose actual class value (last element) equals the
    prediction at the same index, divided by the number of predictions
    '''
    hits = sum(
        1
        for index in range(len(testset))
        if testset[index][-1] == predictions[index]
    )
    return (hits / float(len(predictions))) * 100.0

In [None]:
# Percentage of test rows whose last element matches the prediction.
getAccuracy(testset, predictions)