Handling of Data

In [1]:
import csv

def loadCsv(filename):
    """Read a purely numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-blank line of the file; each entry is
        the list of that line's fields converted with ``float``.

    Raises:
        IOError / OSError: if the file cannot be opened.
        ValueError: if any field cannot be converted with ``float`` (for
            example a textual header row).
    """
    import sys

    # The csv module expects a binary-mode file on Python 2, but a text-mode
    # file opened with newline="" on Python 3.
    if sys.version_info[0] < 3:
        handle = open(filename, "rb")
    else:
        handle = open(filename, "r", newline="")

    # The context manager closes the file as soon as parsing is finished
    # instead of leaving the handle open until garbage collection.
    with handle:
        # csv.reader yields [] for blank lines; skip those so a trailing empty
        # line does not become an empty row in the dataset.
        return [[float(field) for field in row]
                for row in csv.reader(handle) if row]

In [2]:
# Relative path: the CSV is looked up in the kernel's current working directory.
filename = 'pima-indians-diabetes-data.csv'

# Parse the whole file into a list of rows, each row a list of floats.
dataset = loadCsv (filename)

In [4]:
import random

def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    Args:
        dataset: Sequence of rows. It is copied, so the caller's list is
            left untouched.
        splitRatio: Fraction of the rows to draw for training; the number of
            rows drawn is ``int(len(dataset) * splitRatio)``.

    Returns:
        A two-element list ``[trainSet, remaining]``. ``trainSet`` holds the
        rows in the order they were drawn; ``remaining`` holds the data
        points that were not drawn, in their original relative order.
    """
    remaining = list(dataset)
    targetSize = int(len(dataset) * splitRatio)
    drawn = []
    for _ in range(targetSize):
        # Draw without replacement: remove the chosen row from the pool.
        position = random.randrange(len(remaining))
        drawn.append(remaining.pop(position))
    return [drawn, remaining]

Summarization of Data  
The summary of the training data collected involves the mean and the standard deviation for each attribute, by class value.  
We can break the preparation of this summary data down into the following sub-tasks:

1. Separate Data By Class
2. Calculate Mean
3. Calculate Standard Deviation
4. Summarize Dataset
5. Summarize Attributes By Class

The first task is to separate the training dataset instances by class value so that we can calculate statistics for each class. We can do that by creating a map from each class value to the list of instances that belong to that class, and then sorting every instance in the dataset into the appropriate list.

In [5]:
def separateByClass(dataset):
    """Group the rows of ``dataset`` by class value.

    The class value of a row is its last element.

    Args:
        dataset: Sequence of rows (each row an indexable sequence).

    Returns:
        A dict mapping each class value to the list of rows carrying that
        value, with rows kept in their original order.
    """
    grouped = {}
    for row in dataset:
        # setdefault creates the bucket the first time a class value is seen.
        grouped.setdefault(row[-1], []).append(row)
    return grouped

In [6]:
import math

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty.
    """
    # Convert the count to float so the division is never integer division.
    count = float(len(numbers))
    return sum(numbers) / count

def stdev(numbers):
    """Return the population standard deviation of ``numbers``.

    The squared deviations from the mean are averaged over ``len(numbers)``
    (not ``len(numbers) - 1``) before the square root is taken.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty.
    """
    centre = mean(numbers)
    squaredDeviations = [(value - centre) ** 2 for value in numbers]
    return math.sqrt(sum(squaredDeviations) / float(len(numbers)))

In [7]:
def summarize(dataset):
    """Compute ``(mean, stdev)`` for every attribute column of ``dataset``.

    Args:
        dataset: Sequence of equally long rows whose last element is the
            class value.

    Returns:
        A list with one ``(mean, stdev)`` tuple per column, excluding the
        last column.
    """
    stats = []
    # zip(*dataset) transposes the rows into columns.
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # The final column is the class value rather than an attribute, so its
    # statistics are discarded.
    del stats[-1]
    return stats

In [8]:
def summarizeByClass(dataset):
    """Summarize the attributes of ``dataset`` separately for each class.

    Args:
        dataset: Sequence of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the list of per-attribute
        ``(mean, stdev)`` tuples computed by ``summarize`` from the rows of
        that class.
    """
    summaries = {}
    # dict.items() exists on both Python 2 and Python 3, whereas iteritems()
    # raises AttributeError on Python 3.
    for classValue, instances in separateByClass(dataset).items():
        summaries[classValue] = summarize(instances)
    return summaries

Making Predictions  
  
We can divide this part into the following tasks:

1. Calculate Gaussian Probability Density Function
2. Calculate Class Probabilities
3. Make a Prediction
4. Estimate Accuracy

In [32]:
def calculateGaussianProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density function at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.

    Raises:
        ZeroDivisionError: if ``stdev`` is 0.
    """
    squaredDistance = math.pow(x - mean, 2)
    twiceVariance = 2 * math.pow(stdev, 2)
    kernel = math.exp(-(squaredDistance / twiceVariance))
    normaliser = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normaliser * kernel

In [43]:
def calculateGaussianLogProbability(x, mean, stdev):
    """Evaluate the natural log of the Gaussian density at ``x``.

    The log-density is computed analytically as
    ``-(x - mean)^2 / (2 * stdev^2) - log(sqrt(2 * pi) * stdev)``
    rather than as ``log(exp(...))``. Exponentiating first underflows to 0.0
    for points far from the mean, after which ``math.log`` raises
    ``ValueError``; the analytic form stays finite.

    Args:
        x: Point at which to evaluate the log-density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The log of the Gaussian probability density at ``x``.

    Raises:
        ZeroDivisionError: if ``stdev`` is 0.
        ValueError: if ``stdev`` is negative.
    """
    # Evaluated first so that stdev == 0 still surfaces as ZeroDivisionError.
    quadratic = math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))
    logNormaliser = math.log(math.sqrt(2 * math.pi) * stdev)
    return -quadratic - logNormaliser

In [54]:
def calculateClassProbabilities(summaries, inputVector, logProbability=False):
    """Compute the likelihood of ``inputVector`` under each class.

    For every class, the per-attribute Gaussian densities are combined under
    the naive independence assumption: multiplied directly, or summed as
    log-densities and exponentiated at the end when ``logProbability`` is
    set. Both modes return plain (non-log) values.

    Args:
        summaries: Dict mapping class value to a list of ``(mean, stdev)``
            tuples, one per attribute.
        inputVector: Sequence of attribute values; only the first
            ``len(classSummaries)`` entries are read, so a trailing class
            label is ignored.
        logProbability: When true, accumulate in log space.

    Returns:
        A dict mapping each class value to the combined density.

    Raises:
        IndexError: if ``inputVector`` has fewer entries than a class has
            attribute summaries.
    """
    probabilities = {}
    # dict.items() works on both Python 2 and Python 3 (iteritems() does not).
    for classValue, classSummaries in summaries.items():
        if logProbability:
            # A sum of logs must start at log(1) == 0. Starting at 1 would
            # scale every returned probability by a factor of e.
            logLikelihood = 0.0
            for i, (mu, sigma) in enumerate(classSummaries):
                logLikelihood += calculateGaussianLogProbability(inputVector[i], mu, sigma)
            probabilities[classValue] = math.exp(logLikelihood)
        else:
            likelihood = 1
            for i, (mu, sigma) in enumerate(classSummaries):
                likelihood *= calculateGaussianProbability(inputVector[i], mu, sigma)
            probabilities[classValue] = likelihood
    return probabilities

In [55]:
def predict(summaries, inputVector, logProbability=False):
    """Return the class value with the highest likelihood for ``inputVector``.

    Args:
        summaries: Dict mapping class value to per-attribute
            ``(mean, stdev)`` tuples.
        inputVector: Sequence of attribute values to classify.
        logProbability: Forwarded to ``calculateClassProbabilities``.

    Returns:
        The best-scoring class value; on ties the class encountered first
        wins. ``None`` when ``summaries`` is empty.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector, logProbability)
    bestLabel, bestProb = None, -1
    # dict.items() works on both Python 2 and Python 3 (iteritems() does not).
    for classValue, probability in probabilities.items():
        # The first class is always accepted; later ones must strictly beat it.
        if bestLabel is None or probability > bestProb:
            bestLabel, bestProb = classValue, probability
    return bestLabel

In [56]:
def getPredictions(summaries, testSet, logProbability=False):
    """Predict a class value for every row of ``testSet``.

    Args:
        summaries: Dict mapping class value to per-attribute
            ``(mean, stdev)`` tuples.
        testSet: Sequence of rows to classify.
        logProbability: Forwarded to ``predict``.

    Returns:
        A list of predicted class values, in the same order as ``testSet``.
    """
    return [predict(summaries, row, logProbability) for row in testSet]

In [57]:
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose class value was predicted correctly.

    Args:
        testSet: Sequence of rows whose last element is the true class value.
        predictions: Sequence of predicted class values, aligned by index
            with ``testSet``.

    Returns:
        Accuracy as a float between 0 and 100.

    Raises:
        ZeroDivisionError: if ``testSet`` is empty.
        IndexError: if ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    # Each comparison is a bool, which sums as 0 or 1.
    hits = sum(testSet[i][-1] == predictions[i] for i in range(total))
    return hits / float(total) * 100

In [60]:
# Draw 65% of the rows for training; the remainder becomes the test set.
# NOTE: no random seed is set in the visible cells, so the split (and hence
# the reported accuracy) changes from run to run. Call random.seed(...) before
# splitting if a reproducible figure is needed.
splitRatio = 0.65
trainingSet, testSet = splitDataset(dataset, splitRatio)

# Fit the model: per-class (mean, stdev) of every attribute.
summaries = summarizeByClass(trainingSet)

# Classify the held-out rows, accumulating likelihoods in log space.
predictions = getPredictions(summaries, testSet, logProbability=True)
accuracy = getAccuracy(testSet, predictions)

# Parenthesised single-argument form prints identically on Python 2 and 3.
print("Accuracy of Naive Bayes Classifier is %f" % accuracy)

Accuracy of Naive Bayes Classifier is 73.234201
