In [180]:
import csv
import math
import random

In [181]:
def loadCsv(filename):
    lines = csv.reader(open(r'C:\Users\sidde\diabetes.csv'))
    headers = next(lines)
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

In [182]:
def splitDataSet(dataset, splitRatio, trainSet=[], testSet=[]):
    num_list = random.sample(range(0, len(dataset)), len(dataset))
    for i in range(len(dataset)):
        if (i < splitRatio * len(dataset)):
            testSet.append(dataset[i])
        else:
            trainSet.append(dataset[i])

In [183]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [184]:
def mean(listValues):
    return sum(listValues)/float(len(listValues))

In [185]:
def stdDev(listValues):
    avg = mean(listValues)
    variance = sum([pow(x - avg, 2) for x in listValues])/(float(len(listValues)) - 1)
    return math.sqrt(variance)

In [202]:
def summarize(dataset):
    summaries = [(mean(attribute), stdDev(attribute)) for attribute in zip(*dataset)]    
    del summaries[-1]
    return summaries

In [210]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue in separated:        
        summaries[classValue] = summarize(separated[classValue])
    return summaries

In [211]:
def calculateProbability(x, mean, stdDev):
    exponent = math.exp(-(math.pow(x-mean, 2))/(2*math.pow(stdDev,2)))    
    return (exponent/(math.sqrt(2*math.pi)*stdDev))

In [217]:
def calculateClassProbability(summaries, inputVector):
    probabilities = {}
    print(summaries.items())
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdDev = classSummaries[i]            
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdDev)
    return probabilities

In [218]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbability(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel    

In [219]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

In [220]:
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions [x]:
            correct += 1
    return (correct/float(len(testSet))) * 100

In [221]:
def main():
    fileName = r'C:\Users\sidde\diabetes.csv'
    splitRatio = 0.67
    dataset = loadCsv(fileName)
    trainSet = []
    testSet = []
    splitDataSet(dataset, splitRatio, trainSet, testSet)
    print('Split {0} rows to trainSet = {1} and testSet = {2}.'.format(len(dataset), len(trainSet), len(testSet)))
    summaries = summarizeByClass(trainSet)
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy = {0}%'.format(accuracy))
    
main()

Split 768 rows to trainSet = 253 and testSet = 515.
[[3.0, 163.0, 70.0, 18.0, 105.0, 31.6, 0.268, 28.0, 1.0], [9.0, 145.0, 88.0, 34.0, 165.0, 30.3, 0.771, 53.0, 1.0], [9.0, 130.0, 70.0, 0.0, 0.0, 34.2, 0.652, 45.0, 1.0], [4.0, 132.0, 0.0, 0.0, 0.0, 32.9, 0.302, 23.0, 1.0], [3.0, 129.0, 92.0, 49.0, 155.0, 36.4, 0.968, 32.0, 1.0], [8.0, 100.0, 74.0, 40.0, 215.0, 39.4, 0.661, 43.0, 1.0], [3.0, 128.0, 72.0, 25.0, 190.0, 32.4, 0.549, 27.0, 1.0], [10.0, 90.0, 85.0, 32.0, 0.0, 34.9, 0.825, 56.0, 1.0], [8.0, 186.0, 90.0, 35.0, 225.0, 34.5, 0.423, 37.0, 1.0], [5.0, 187.0, 76.0, 27.0, 207.0, 43.6, 1.034, 53.0, 1.0], [6.0, 125.0, 76.0, 0.0, 0.0, 33.8, 0.121, 54.0, 1.0], [0.0, 198.0, 66.0, 32.0, 274.0, 41.3, 0.502, 28.0, 1.0], [0.0, 121.0, 66.0, 30.0, 165.0, 34.3, 0.203, 33.0, 1.0], [2.0, 118.0, 80.0, 0.0, 0.0, 42.9, 0.693, 21.0, 1.0], [2.0, 197.0, 70.0, 99.0, 0.0, 34.7, 0.575, 62.0, 1.0], [0.0, 151.0, 90.0, 46.0, 0.0, 42.1, 0.371, 21.0, 1.0], [8.0, 124.0, 76.0, 24.0, 600.0, 28.7, 0.687, 52.0, 1.0

dict_items([(1.0, [(5.0120481927710845, 3.6575814210086515), (144.86746987951807, 29.462572820087523), (73.02409638554217, 20.034711282814104), (22.771084337349397, 18.87011141746214), (98.28915662650603, 136.91723778596545), (34.74939759036147, 6.829855076382249), (0.5176987951807227, 0.32344998991210455), (38.68674698795181, 11.55783728486852)]), (0.0, [(3.323529411764706, 3.1333346910459587), (109.58235294117647, 23.04784668975517), (68.1470588235294, 19.223894698862896), (19.341176470588234, 15.260695495621881), (70.81176470588235, 90.1616554923862), (30.834117647058825, 7.277802076382699), (0.41222352941176504, 0.2903875955617054), (30.870588235294118, 11.272092821433326)])])
dict_items([(1.0, [(5.0120481927710845, 3.6575814210086515), (144.86746987951807, 29.462572820087523), (73.02409638554217, 20.034711282814104), (22.771084337349397, 18.87011141746214), (98.28915662650603, 136.91723778596545), (34.74939759036147, 6.829855076382249), (0.5176987951807227, 0.32344998991210455), (