## 3.1 Na&iuml;ve Bayes from Scratch

In [10]:
import csv

Load data.

In [8]:
def load_csv(fn):
    """Load a purely numeric CSV file into a list of float rows.

    Parameters:
        fn: path of the CSV file to read.

    Returns:
        A list with one list of floats per non-blank line of the file.

    Raises:
        ValueError: if a field cannot be converted with ``float``.
        OSError: if the file cannot be opened.
    """
    # The context manager closes the file even when a conversion fails;
    # newline='' is the mode the csv module expects for its input files.
    with open(fn, newline='') as handle:
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # skip them so no empty row reaches code that indexes row[-1].
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

In [9]:
# Relative path: the CSV has to be in the notebook's working directory.
fn = 'pima-indians-diabetes.data.csv'
dataset = load_csv(fn)
# Print the row count as a quick check that the file was read.
print('rows', len(dataset))

rows 768


Split data into train/test

In [11]:
import random

In [27]:
def split_database(dataset, ratio, seed=None):
    """Randomly split ``dataset`` into a training part and a test part.

    Parameters:
        dataset: sequence of rows; it is copied, never modified.
        ratio: fraction of rows (between 0 and 1) that go to the training part.
        seed: optional seed. When given, the shuffle uses its own
            ``random.Random(seed)`` generator, so the same seed always gives
            the same split. When omitted, the module-level ``random.shuffle``
            is used, exactly as before.

    Returns:
        A ``(train, test)`` tuple of lists. ``train`` holds
        ``round(len(dataset) * ratio)`` rows and ``test`` holds the rest.

    Raises:
        ValueError: if ``ratio`` is outside the range [0, 1].
    """
    # A negative ratio would produce a negative slice index and a ratio above 1
    # would leave the test part empty; reject both instead of returning
    # a misleading split.
    if not 0 <= ratio <= 1:
        raise ValueError(f'ratio must be between 0 and 1, got {ratio}')
    train_size = round(len(dataset) * ratio)
    shuffled = list(dataset)  # shallow copy so the caller's list keeps its order
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)
    return shuffled[:train_size], shuffled[train_size:]

In [28]:
# Toy example with five single-value rows.
# NOTE: this rebinds the notebook-level name `dataset`, so the rows loaded
# from the CSV earlier are no longer reachable through it.
dataset = [[1],[2],[3],[4],[5]]
ratio = 0.67
train, test = split_database(dataset,ratio)
# No random seed is set in the cells shown here, so the printed split can
# differ between runs.
print(f'Train: {train} , Test: {test}')

Train: [[1], [5], [2]] , Test: [[4], [3]]


### 2. Summarize data

**Separate the training dataset instances by class value** so that we can calculate statistics for each class.<br>
The result is a dict that maps each class value to the list of instances belonging to that class.<br>

In [35]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label.

    The label of a row is its last element. Returns a dict that maps every
    label to the list of rows carrying it; rows keep their original order and
    are stored by reference (not copied).
    """
    grouped = {}
    for row in dataset:
        label = row[-1]
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(label, []).append(row)
    return grouped

In [37]:
# Toy rows: the last element of each row is the class label that
# separate_by_class groups on.
dataset = [[1,20,1],[2,21,0],[3,22,1],[4,22,0]]
sep = separate_by_class(dataset)
# Bare expression so the notebook displays the resulting dict.
sep

{1: [[1, 20, 1], [3, 22, 1]], 0: [[2, 21, 0], [4, 22, 0]]}

**Summarize**<br>
<mark>zip</mark> groups the values of each attribute across our data instances so that we can compute the mean and standard deviation of that attribute.

In [38]:
import numpy as np

In [39]:
def summarize(dataset):
    """Compute (mean, standard deviation) for every attribute column.

    Parameters:
        dataset: list of rows; the last element of each row is the class
            label and is excluded from the statistics.

    Returns:
        A list with one ``(mean, std)`` tuple per attribute, in column order.
        ``std`` is ``np.std`` with its default settings (population standard
        deviation). An empty dataset yields an empty list.
    """
    # zip(*dataset) transposes rows into columns. The label column is dropped
    # *before* any statistics are computed, so non-numeric labels such as
    # 'a'/'b' no longer make np.mean fail.
    attribute_columns = list(zip(*dataset))[:-1]
    return [(np.mean(column), np.std(column)) for column in attribute_columns]

In [40]:
# Toy rows: two attribute columns followed by the class label, which
# summarize() leaves out of its result.
dataset = [[1,20,0], [2,21,1], [3,22,0], [4,22,0]]
summary = summarize(dataset)

In [41]:
# Show the (mean, std) pair computed for each attribute column.
print(f'Attribute summaries: {summary}')

Attribute summaries: [(2.5, 1.118033988749895), (21.25, 0.82915619758885)]


**Summarize attributes by class**

In [50]:
def summarize_by_class(dataset):
    """Summarize the attributes separately for every class.

    Rows are first grouped with ``separate_by_class``; each group is then
    passed to ``summarize``. Returns a dict mapping each class value to the
    summary list produced for that class's rows.
    """
    return {
        label: summarize(rows)
        for label, rows in separate_by_class(dataset).items()
    }

In [51]:
# Toy rows with two classes (labels 1 and 0 in the last column).
dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
summary = summarize_by_class(dataset)
# Prints one list of (mean, std) pairs per class label.
print(summary)

{1: [(2.0, 1.0), (21.0, 1.0)], 0: [(3.0, 1.0), (21.5, 0.5)]}


### 3. Predict
Calculate the probability that a given data instance belongs to each class, then select the class with the largest probability.

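**Calculate the Gaussian probability density function**

For an attribute value $x$ and a class with mean $\mu$ and standard deviation $\sigma$:

$$f(x) = \frac{1}{\sqrt{2\pi}\,\sigma}\,\exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$$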

In [52]:
import math

In [53]:
def gaussian_prob(x, mean, std):
    """Evaluate the Gaussian probability density function at ``x``.

    Parameters:
        x: value at which the density is evaluated.
        mean: mean of the distribution.
        std: standard deviation of the distribution.

    Returns:
        ``1 / (sqrt(2*pi) * std) * exp(-(x - mean)**2 / (2 * std**2))``.
        When ``std`` is 0 (all observed values were identical) the formula is
        undefined, so the function returns 1.0 if ``x`` equals ``mean`` and
        0.0 otherwise instead of raising ``ZeroDivisionError``.
    """
    # Zero spread: treat the distribution as a point mass at `mean` so that
    # callers multiplying densities get a usable factor rather than a crash.
    if std == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp((-(math.pow(x - mean, 2))) / (2 * math.pow(std, 2)))
    return (1 / (math.sqrt(2 * math.pi) * std)) * exponent

In [55]:
# Example inputs for a single attribute value and one class's statistics.
x = 71.5
mean= 73
std = 6.2
prob = gaussian_prob(x,mean,std)
# Strictly speaking the printed number is a density value returned by
# gaussian_prob, not a probability.
print(f'Probability of belonging to this class: {prob}')

Probability of belonging to this class: 0.06248965759370005


In [56]:
# Hand-written summaries: class label -> list of (mean, std) pairs, one pair
# per attribute (the form class_prob unpacks).
summaries = {0:[(1, 0.5), (2, 3.5)], 1:[(20, 5.0), (17, 1.5)]}
# A single instance with one value per attribute.
instance = [1.1, 10]

**Calculate class probabilities**<br>
Multiply the attribute probabilities for each class to get the class probability of a given data instance.

In [57]:
def class_prob(summaries, instance):
    """Score ``instance`` against every class.

    ``summaries`` maps a class value to a list of ``(mean, std)`` pairs, one
    per attribute. For each class the Gaussian densities of the instance's
    attribute values are multiplied together, starting from 1. Only the first
    ``len(pairs)`` values of ``instance`` are read, so extra trailing
    elements are ignored.

    Returns a dict mapping each class value to that product.
    """
    scores = {}
    for label, attr_stats in summaries.items():
        likelihood = 1
        for idx, (mu, sigma) in enumerate(attr_stats):
            likelihood *= gaussian_prob(instance[idx], mu, sigma)
        scores[label] = likelihood
    return scores

In [58]:
# Uses the `summaries` and `instance` values bound in the notebook namespace.
probs = class_prob(summaries, instance)
print(f'Probabilities for each class: {probs}')

Probabilities for each class: {0: 0.006540525583425349, 1: 3.126711940345906e-10}


In [60]:
def predict(summaries, instance):
    """Return the class value with the highest score for ``instance``.

    Scores come from ``class_prob``. On a tie the class that appears first in
    ``summaries`` wins; ``None`` is returned when there are no classes.
    """
    scores = class_prob(summaries, instance)
    # max() keeps the first maximal key, matching a strict ">" scan.
    return max(scores, key=scores.get, default=None)

In [61]:
# Same hand-written summaries and instance as in the class-probability
# example, re-bound here so this cell does not rely on the earlier one.
summaries = {0:[(1, 0.5), (2, 3.5)], 1:[(20, 5.0), (17, 1.5)]}
instance = [1.1, 10]
result = predict(summaries, instance)
print(f'Prediction: {result}')

Prediction: 0


**Make predictions for the test set**<br>
Return a list with one prediction per test instance.

In [64]:
def get_predictions(summaries, test):
    """Predict a class for every row of ``test``.

    Returns a list holding the result of ``predict(summaries, row)`` for each
    row, in the same order as ``test``.
    """
    return [predict(summaries, row) for row in test]

In [65]:
# Class labels are strings here ('A' and 'B'); each maps to a list of
# (mean, std) pairs.
summaries = {'A':[(1, 0.5), (2, 3.5)], 'B':[(20, 5.0), (17, 1.5)]}
# Three test instances with two attribute values each.
test = [[1.1, 5.0], [19.1, 18.3], [18.0, 2.0]]
preds = get_predictions(summaries, test)
print(f'Predictions: {preds}')

Predictions: ['A', 'B', 'B']


### 4. Evaluate

In [71]:
def accuracy(test, preds):
    """Return the fraction of test rows whose label was predicted correctly.

    Parameters:
        test: list of rows; the last element of each row is the true label.
        preds: predicted labels, ``preds[i]`` belonging to ``test[i]``.

    Returns:
        A float between 0 and 1. An empty ``test`` gives 0.0 instead of
        raising ``ZeroDivisionError``.

    Raises:
        IndexError: if ``preds`` has fewer entries than ``test``.
    """
    # Nothing to score: report 0.0 rather than dividing by zero.
    if len(test) == 0:
        return 0.0
    # Index preds explicitly (rather than zip) so a too-short preds list is
    # reported as an error instead of being silently truncated.
    correct = sum(1 for i, row in enumerate(test) if row[-1] == preds[i])
    return correct / len(test)

In [72]:
# Each row ends with its true label; accuracy() compares that last element
# with the prediction at the same index.
test = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]  # Ground truth
preds = ['a', 'a', 'a']
acc = accuracy(test, preds)
# The value is a fraction in [0, 1], not a percentage.
print(f'Accuracy: {acc}')

Accuracy: 0.6666666666666666


### Work with real dataset!