In [1]:
import csv
import math
import random

In [2]:
def load_csv(filename):
    """Read a purely numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-blank CSV row; each entry is a list
        holding every field of that row converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to ``float``.
    """
    # The context manager guarantees the handle is closed even when a field
    # fails to convert; newline='' is the mode the csv module asks for.
    with open(filename, 'r', newline='') as csv_file:
        # csv.reader yields [] for completely blank lines; skip those so the
        # dataset never contains a row without a label column.
        return [[float(field) for field in row]
                for row in csv.reader(csv_file) if row]

In [3]:
def split_dataset(dataset, ratio, seed=None):
    """Randomly partition ``dataset`` into a training and a testing set.

    Args:
        dataset: Sequence of rows. It is copied, never modified.
        ratio: Fraction of the rows to move into the training set; the
            training size is ``int(len(dataset) * ratio)``.
        seed: Optional seed for a private random generator, making the split
            reproducible without touching the global ``random`` state. When
            ``None`` (the default) the module-level generator is used.

    Returns:
        ``[train_set, test_set]`` where ``test_set`` holds the rows that were
        not drawn, in their original relative order.

    Raises:
        ValueError: If ``ratio`` asks for more rows than ``dataset`` holds.
    """
    size_of_training_set = int(len(dataset) * ratio)
    if size_of_training_set > len(dataset):
        # Without this check the loop below would drain the pool and then
        # fail inside randrange with an unhelpful "empty range" message.
        raise ValueError(
            "ratio {0!r} requests {1} training rows but the dataset has only {2}".format(
                ratio, size_of_training_set, len(dataset)))

    # A dedicated generator keeps seeded runs isolated from global state.
    rng = random if seed is None else random.Random(seed)

    train_set = []
    test_set = list(dataset)
    while len(train_set) < size_of_training_set:
        index = rng.randrange(len(test_set))
        train_set.append(test_set.pop(index))
    return [train_set, test_set]

In [4]:
def separate_by_label(dataset):
    """Group the rows of ``dataset`` by the value in their last column.

    Returns:
        A dict mapping each distinct last-column value to the list of rows
        (the same row objects, in dataset order) that carry it.
    """
    groups = {}
    for row in dataset:
        # The last element of every row is treated as its label.
        groups.setdefault(row[-1], []).append(row)
    return groups

In [5]:
def calc_mean(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Raises:
        ZeroDivisionError: If ``lst`` is empty.
    """
    total = sum(lst)
    count = float(len(lst))
    return total / count

In [6]:
def calc_standard_deviation(lst):
    """Return the sample standard deviation of the numbers in ``lst``.

    The squared deviations from the mean are divided by ``len(lst) - 1``
    before the square root is taken.

    Raises:
        ZeroDivisionError: If ``lst`` holds fewer than two values.
    """
    center = calc_mean(lst)
    squared_deviations = [pow(value - center, 2) for value in lst]
    degrees_of_freedom = float(len(lst) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)

In [7]:
def summarize_data(lst):
    """Compute ``(mean, standard deviation)`` for every column but the last.

    Args:
        lst: List of equally long rows; the final column is dropped from the
            result after the statistics have been computed.

    Returns:
        A list of ``(mean, standard_deviation)`` tuples, one per remaining
        column, in column order.
    """
    per_column = []
    # zip(*lst) transposes the rows so each iteration sees one whole column.
    for column in zip(*lst):
        per_column.append((calc_mean(column), calc_standard_deviation(column)))
    # Discard the statistics of the trailing column.
    del per_column[-1]
    return per_column

In [8]:
def summarize_by_label(data):
    """Build per-label column statistics for ``data``.

    Returns:
        A dict mapping each label found by ``separate_by_label`` to the
        result of ``summarize_data`` over the rows carrying that label.
    """
    return {
        label: summarize_data(rows)
        for label, rows in separate_by_label(data).items()
    }

In [9]:
def calc_probability(x, mean, standard_deviation):
    """Evaluate the Gaussian probability density at ``x``.

    Computes ``exp(-(x - mean)^2 / (2 * sd^2)) / (sqrt(2 * pi) * sd)`` where
    ``sd`` is ``standard_deviation``.

    Raises:
        ZeroDivisionError: If ``standard_deviation`` is zero.
    """
    # Exponential term: e ^ (-(x - mean)^2 / (2 * sd^2))
    squared_distance = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(standard_deviation, 2)
    exponent = math.exp(-(squared_distance / twice_variance))
    # Normalising factor: 1 / (sqrt(2 * pi) * sd)
    normalizer = 1 / (math.sqrt(2 * math.pi) * standard_deviation)
    return normalizer * exponent

In [10]:
def calc_label_probabilities(summaries, input_vector):
    """Score ``input_vector`` against every label's column statistics.

    Args:
        summaries: Dict mapping a label to a list of
            ``(mean, standard_deviation)`` pairs, one per attribute.
        input_vector: Sequence indexed in step with those pairs; elements
            beyond the number of pairs are ignored.

    Returns:
        A dict mapping each label to the product of ``calc_probability`` over
        all of its attributes (``1`` when a label has no attribute pairs).
    """
    probabilities = {}
    for label, label_summaries in summaries.items():
        likelihood = 1
        for position, (mean, standard_dev) in enumerate(label_summaries):
            likelihood *= calc_probability(input_vector[position], mean, standard_dev)
        probabilities[label] = likelihood
    return probabilities

In [11]:
def predict(summaries, input_vector):
    """Return the label whose score for ``input_vector`` is the highest.

    Ties keep the label encountered first; ``None`` is returned when
    ``summaries`` yields no labels at all.
    """
    scores = calc_label_probabilities(summaries, input_vector)
    winner, winner_score = None, -1
    for candidate, score in scores.items():
        # Keep the current winner unless this score is strictly greater.
        if winner is not None and not score > winner_score:
            continue
        winner, winner_score = candidate, score
    return winner

In [12]:
def get_predictions(summaries, test_set):
    """Return the predicted label for every row of ``test_set``, in order."""
    return [predict(summaries, row) for row in test_set]

In [13]:
def get_accuracy(test_set, predictions):
    """Return the percentage of rows whose last column equals its prediction.

    Args:
        test_set: List of rows; the last element of each row is the true label.
        predictions: Predicted labels, indexed in step with ``test_set``.

    Raises:
        ZeroDivisionError: If ``test_set`` is empty.
    """
    hits = sum(
        1
        for position in range(len(test_set))
        if test_set[position][-1] == predictions[position]
    )
    fraction_correct = hits / float(len(test_set))
    return fraction_correct * 100

In [15]:
def main(filename, split_ratio):
    """Train and evaluate the classifier on a CSV file, printing the results.

    Args:
        filename: Path of the CSV file passed to ``load_csv``.
        split_ratio: Training fraction passed to ``split_dataset``.
    """
    rows = load_csv(filename)
    train_rows, test_rows = split_dataset(rows, split_ratio)
    print("Size of Training Set: ", len(train_rows))
    print("Size of Testing Set: ", len(test_rows))

    # Build the model: per-label statistics of the training rows.
    model = summarize_by_label(train_rows)

    # Evaluate the model on the held-out rows.
    predicted_labels = get_predictions(model, test_rows)
    score = get_accuracy(test_rows, predicted_labels)
    print('Accuracy: {0}%'.format(score))
# Run the full pipeline with 70% of the rows used for training.
main(filename='pima-indians-diabetes.data.csv', split_ratio=0.70)

Size of Training Set:  537
Size of Testing Set:  231
Accuracy: 72.72727272727273%
