# Naive Bayes Classifier Using the Pima Diabetes Dataset

In [1]:
import numpy as np
import math
import random

In [8]:
def loadData(filename):
    """Read a numeric text file and return its contents as a list.

    The file is parsed with ``np.genfromtxt`` using that function's default
    settings, and the parsed array is turned into a list of its top-level
    entries. The entry count and the entries themselves are printed before
    the list is returned.
    """
    parsed = np.genfromtxt(filename)
    rows = [entry for entry in parsed]
    # Echo the size and the content of what was loaded.
    print(len(rows))
    print(rows)
    return rows
    

In [23]:
def splitData(dataset, splitRatio):
    """Randomly split a dataset into a training set and a test set.

    Rows are drawn at random (without replacement) from a copy of
    ``dataset`` and moved into the training set until it holds
    ``int(len(dataset) * splitRatio)`` rows; whatever remains is the test
    set. The caller's ``dataset`` is not modified.

    Args:
        dataset: Sequence of data rows.
        splitRatio: Fraction of the rows that should go to the training set.

    Returns:
        A two-element list ``[trainSet, testSet]``.

    Raises:
        ValueError: If the requested training size exceeds the number of
            rows (``random.randrange`` is called on an empty range).
    """
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    # Work on a copy so that popping rows does not mutate the input.
    testSet = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(testSet))
        # The original appended to the undefined name ``trainset``.
        trainSet.append(testSet.pop(index))
    return [trainSet, testSet]

Exploratory Data Analysis (Separation by class, Calculating Mean, Std Deviation, Summarizing attributes by class)

In [1]:
def separateByClass(dataset):
    """Group the rows of a dataset by their class value.

    The last element of every row is taken to be its class value.

    Args:
        dataset: Iterable of indexable rows.

    Returns:
        A dict mapping each class value to the list of rows carrying that
        value, in their original order. An empty dataset yields ``{}``.
    """
    separated = {}
    for vector in dataset:
        # Create the class bucket on first sight, then file the row in it.
        separated.setdefault(vector[-1], []).append(vector)
    return separated

In [2]:
def mean(numbers):
    """Return the arithmetic mean of a sized collection of numbers."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [3]:
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers) - 1`` before the square root is taken.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)

In [5]:
def summarize(dataset):
    """Compute a (mean, standard deviation) pair for each column.

    The rows of ``dataset`` are transposed into columns, each column is
    summarized, and the summary of the last column is then discarded.
    """
    stats = []
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # Drop the entry computed for the final column.
    del stats[-1]
    return stats

In [6]:
def summarizeByClass(dataset):
    """Return per-class column summaries for ``dataset``.

    The dataset is grouped with ``separateByClass`` and every group is
    passed to ``summarize``; the result maps each class value to that
    group's summary.
    """
    grouped = separateByClass(dataset)
    return {label: summarize(rows) for label, rows in grouped.items()}

In [9]:
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian density with the given mean and stdev at ``x``."""
    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    decay = math.exp(-(squared_deviation / twice_variance))
    normalizer = math.sqrt(2 * math.pi) * stdev
    return (1 / normalizer) * decay