In [1]:
# use following link for tutorial
# https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/
# 
# note: class == diabetes / no diabetes, attribute == different numerical variables
#     --> need to calculate avg / std of each attribute for each class


In [91]:
# 1. Handle Data

#load Pima indians dataset
import pandas as pd 

filename = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
dataset = pd.read_csv(filename);
dataset = dataset.values.tolist()

print(f'Loaded data file {filename} with {len(dataset)} rows');


Loaded data file https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv with 767 rows


In [92]:
import random

"""
Function name: splitData
    Purpose: split data into test and train sets randomly
    inputs: dataset, splitRatio (number between 0 and 1, ratio of train data)
    output: nested list of split data (train and test)
"""
def splitData(dataset, splitRatio):
    trainSize = int(len(dataset)*splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

# test function
dataset = [[1], [2], [3], [4], [5]]
splitRatio = 0.67
train, test = splitData(dataset, splitRatio)
print(f'Split {len(dataset)} into train with {train} and test with {test}')



Split 5 into train with [[3], [4], [1]] and test with [[2], [5]]


In [93]:
# 2. Summarize Data

# Separate data by class
""" 
Function name: separateByClass
    input: dataset given as a nested list
    output: dictionary where 1: train set, 0: test set
"""
def separateByClass(dataset):
    separated = {} # of type list
    for i in range(len(dataset)):
        vector = dataset[i]
        
        # assume that last column contains class labels
        # create new dictionary label if you haven't encountered the class yet
        if (vector[-1] not in separated): 
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

# test function
dataset = [[1,20,1], [2,21,0], [3,22,1]]
separated = separateByClass(dataset)
print(f'Separated instances: {separated}')

Separated instances: {1: [[1, 20, 1], [3, 22, 1]], 0: [[2, 21, 0]]}


In [94]:
# Calculate mean (use for gaussian distribution when calculating probabilities)
import math

"""
Function name: mean
    input: list of numbers
    output: average values of inputs
"""
def mean(numbers):
    return sum(numbers)/float(len(numbers))

"""
Function name: stdev
    input: list of numbers
    output: standard deviation of inputs
"""
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

# test functions
numbers = [1,2,3,4,5]
print(f'Summary of {numbers}: mean={mean(numbers)}, stdev={stdev(numbers)}')
    

Summary of [1, 2, 3, 4, 5]: mean=3.0, stdev=1.5811388300841898


In [95]:
# Summarize dataset

"""
Function name: summarize
    input: dataset (nested list)
    output: mean, standard deviation for each element in nested list (each class)
"""
def summarize(dataset):
    # zip function groups values for each attribute into their own lists
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1] # don't need mean / stdev of class labels
    return summaries

# test function
dataset = [[1,20,0], [2,21,1], [3,22,0]]
summary = summarize(dataset)
print(f'Attribute summaries: {summary}')


Attribute summaries: [(2.0, 1.0), (21.0, 1.0)]


In [96]:
# Summarize attributes by class

"""
Function name: summarizeByClass 
    input: dataset (nested list)
    output: dictionary w/ stats for each attribute separated by class labels
"""
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {} # empty dictionary
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

# test function
dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
summary = summarizeByClass(dataset)
print(f'Summary by class value: {summary}')


Summary by class value: {1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)], 0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)]}


In [43]:
# 3. Make Prediction

# Calculate gaussian probability density functions
import math

"""
Function name: calculateProbability
    input: testing value, mean of input distribution, stdev of input distribution
    output: probability that testing value falls in given distribution
"""
def calculateProbability(x,mean,stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1/(math.sqrt(2*math.pi)*stdev)) * exponent

# test function
x = 71.5 # test point
mean = 73
stdev = 6.2
probability = calculateProbability(x,mean,stdev)
print(f'Probability of belonging to this class: {probability}')


Probability of belonging to this class: 0.06248965759370005


In [53]:
# Calculate class probabilities 

"""
Function name: calculateClassProbabilities
    input: stats for each class in dictionary 
    output: probability of data point belonging to multiple classes
"""
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {} # empty dictionary
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1 # initialize
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            
            # multiply individual prob to get prob of entire data instance
            probabilities[classValue] *= calculateProbability(x,mean,stdev)
    
    return probabilities

# test function
summaries = {0:[(1,0.5)], 1:[(20, 5.0)]}
inputVector = [1.1,'?']
probabilities = calculateClassProbabilities(summaries, inputVector)
print(f'Probabilities for each class: {probabilities}')


Probabilities for each class: {0: 0.7820853879509118, 1: 6.298736258150442e-05}


In [54]:
# Make a prediction

"""
Function name: predict
    input: stats for each class in dictionary, input value 
    output: predicted class based on max probability of any individual class 
"""
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1 # initialize vars
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

# test function
summaries = {'A': [(1,0.5)], 'B':[(20,5.0)]}
inputVector = [1.1,'?']
result = predict(summaries, inputVector)
print(f'Prediction: {result}')


Prediction: A


In [55]:
# 4. Make Predictions

"""
Function name: getPrediction 
    input: stats for each class in dictionary, list of input values (nested list)
    output: class prediction list
"""
def getPredictions(summaries, testSet):
    predictions = [] # empty list
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

# test function
summaries = {'A': [(1,0.5)], 'B':[(20,5.0)]}
testSet = [[1.1,'?'],[19.1,'?']]
predictions = getPredictions(summaries, testSet)
print(f'Predictions: {predictions}')


Predictions: ['A', 'B']


In [56]:
# 5. Get Accuracy

"""
Function name: getAccuracy
    input: nested list of test set, list of predictions
    output: accuracy between known test labels and predictions
"""
def getAccuracy(testSet, predictions):
    correct = 0 # initialize
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

# test function
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = getAccuracy(testSet, predictions)
print(f'Accuracy: {accuracy}')


Accuracy: 66.66666666666666


In [106]:
# actual main function: test on pima indians dataset
filename = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
splitRatio = 0.67
dataset = pd.read_csv(filename);
dataset = dataset.values.tolist()

trainingSet, testSet = splitData(dataset, splitRatio)
print(f'Split {len(dataset)} into train={len(trainingSet)} and test={len(testSet)} rows')

# prepare model
summaries = summarizeByClass(trainingSet)

# test model
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print(f'Accuracy: {accuracy}%')



Split 767 into train=513 and test=254 rows
Accuracy: 75.98425196850394%
