# Naive Bayes Classifier Using the Pima Diabetes Dataset

In [132]:
import numpy as np
import math
import random

In [133]:
def loadData(filename):
    """Load a space-delimited numeric dataset from a text file.

    Args:
        filename: Path of the file to read. Fields on each line are
            separated by a single space and must all be numeric.

    Returns:
        A list of rows, each row being a list of Python floats.
    """
    # Read the file the caller asked for (the path used to be hard-coded, so
    # the ``filename`` argument was silently ignored).
    # ndmin=2 keeps a one-line file as a single row instead of a flat vector,
    # which would otherwise break the per-row conversion below.
    lines = np.loadtxt(filename, delimiter=" ", ndmin=2)
    return [[float(x) for x in row] for row in lines]

In [119]:
def splitData(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training set and a test set.

    ``int(len(dataset) * splitRatio)`` rows are drawn at random, without
    replacement, into the training set; the rows that were never drawn make
    up the test set. The list passed in is copied, not modified.

    Return: [trainSet, testSet]
    """
    targetSize = int(len(dataset) * splitRatio)
    remaining = list(dataset)
    chosen = []
    for _ in range(targetSize):
        # Move one randomly selected row from the pool to the training set.
        pick = random.randrange(len(remaining))
        chosen.append(remaining.pop(pick))
    return [chosen, remaining]

Exploratory Data Analysis (separating the data by class, calculating the mean and standard deviation, and summarizing the attributes by class)

In [120]:
def separateByClass(dataset):
    """Group the rows of ``dataset`` by their class value.

    The last element of every row is taken to be its class value.

    Args:
        dataset: Iterable of rows (indexable sequences).

    Returns:
        A dict mapping each class value to the list of rows that carry it,
        in their original order. An empty dataset yields an empty dict.
    """
    separated = {}
    for vector in dataset:
        # setdefault creates the bucket the first time a class value is seen.
        separated.setdefault(vector[-1], []).append(vector)
    # Return only once every row has been bucketed; returning from inside
    # the loop would hand back a dict holding just the first row.
    return separated

In [121]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers``."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [137]:
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before the square root is taken.

    Raises:
        ZeroDivisionError: If ``numbers`` holds fewer than two values.
    """
    avg = mean(numbers)
    # Denominator is n - 1 (sample variance), not n.
    variance = sum(pow(x - avg, 2) for x in numbers) / float(len(numbers) - 1)
    return math.sqrt(variance)

In [123]:
def summarize(dataset):
    """Compute per-column statistics for a group of rows.

    Every column of ``dataset`` is reduced to a ``(mean, stdev)`` tuple. The
    entry for the last column (the one separateByClass treats as the class
    value) is dropped before the list is returned.
    """
    stats = []
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # Discard the statistics of the trailing column.
    del stats[-1]
    return stats

In [124]:
def summarizeByClass(dataset):
    """Build the per-class attribute statistics for ``dataset``.

    The rows are grouped with ``separateByClass`` and each group is then
    reduced with ``summarize``.

    Returns a dict mapping each class value to that group's summaries.
    """
    grouped = separateByClass(dataset)
    return {label: summarize(rows) for label, rows in grouped.items()}

In [125]:
def calculateProbability(x, mean, stdev):
    """Return the Gaussian probability density of ``x``.

    ``mean`` and ``stdev`` are the parameters of the normal distribution
    the density is evaluated for.
    """
    squaredDeviation = math.pow(x - mean, 2)
    twiceVariance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squaredDeviation / twiceVariance))
    scale = 1 / (math.sqrt(2 * math.pi) * stdev)
    return scale * exponent

In [126]:
def calculateClassProbabilities(summaries, inputVector):
    """Score ``inputVector`` against every class in ``summaries``.

    For each class, the densities returned by ``calculateProbability`` for
    the individual attributes are multiplied together.

    Args:
        summaries: Dict mapping each class value to a list of
            ``(mean, stdev)`` tuples, one per attribute.
        inputVector: Sequence of attribute values; element ``i`` is scored
            against the ``i``-th tuple. Elements beyond the number of
            tuples are not read.

    Returns:
        Dict mapping every class value to the product of its attribute
        densities. A class with no attribute summaries scores 1.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        for i, (attrMean, attrStdev) in enumerate(classSummaries):
            likelihood *= calculateProbability(inputVector[i], attrMean, attrStdev)
        probabilities[classValue] = likelihood
    # Return after all classes are scored; an early return inside the loop
    # would leave only the first class in the result.
    return probabilities

# Make prediction

In [127]:
def predict(summaries, inputVector):
    """Return the class value with the highest probability for ``inputVector``.

    Probabilities come from ``calculateClassProbabilities``. On a tie the
    class encountered first is kept; with no classes at all the result is
    ``None``.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    winner = None
    winningScore = -1
    for label, score in scores.items():
        # Skip candidates that do not beat the current leader.
        if winner is not None and not score > winningScore:
            continue
        winningScore = score
        winner = label
    return winner

# Accuracy of the prediction

In [128]:
def getPredictions(summaries, testSet):
    """Predict a class value for every row of ``testSet``.

    Args:
        summaries: Per-class attribute summaries, passed through to
            ``predict``.
        testSet: Iterable of rows to classify.

    Returns:
        A list with one prediction per row, in the order of ``testSet``.
    """
    # Classify each row in turn (the old code indexed testSet[1], so every
    # prediction was made on the second row).
    return [predict(summaries, row) for row in testSet]

In [129]:
def getAccuracy(testSet, Predictions):
    """Return the percentage of rows whose class was predicted correctly.

    Args:
        testSet: Sequence of rows; the last element of each row is its true
            class value.
        Predictions: Predicted class values, aligned index-for-index with
            ``testSet``.

    Returns:
        The accuracy as a float percentage. An empty ``testSet`` yields 0.0
        instead of dividing by zero.
    """
    total = len(testSet)
    if total == 0:
        return 0.0
    correct = 0
    for i, row in enumerate(testSet):
        # Use the ``Predictions`` argument; the body used to reference an
        # undefined lowercase name.
        if row[-1] == Predictions[i]:
            correct += 1
    return (correct / float(total)) * 100.0

# Main Function

Makes use of the functions created above

In [130]:
def main():
    """Train and evaluate the classifier on "pimadiabetes.txt".

    Loads the dataset, splits it at random into training (67%) and test
    rows, builds the per-class summaries from the training rows, predicts
    the test rows and prints the resulting accuracy.
    """
    filename = "pimadiabetes.txt"
    splitRatio = 0.67
    dataset = loadData(filename)
    trainSet, testSet = splitData(dataset, splitRatio)
    # "{1}" used to be a literal "1", so the training-set size never printed.
    print("split {0} rows into train = {1} and test = {2} rows".format(len(dataset), len(trainSet), len(testSet)))

    # prepare the model
    summaries = summarizeByClass(trainSet)

    # test the model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)

    # print the Accuracy
    print("Accuracy: {0}%".format(accuracy))

In [None]:
# Run the whole pipeline: load the data, build the model and print the accuracy.
main()