## Naive Bayes implementation in Python from scratch

In [69]:
# Naive Bayes uses Bayes' Theorem with the "naive" assumption that the columns in the dataset are independent of each other given the class

# Bayes Theorem:

# P(class|data) = P(data|class) * P(class) / P(data)

# We will go over a few steps to implement the above idea:

# We will separate the dataset by class, that is, we will group the rows having the same class
# We will calculate the mean and standard deviation for each column and each class
# We will write a Gaussian probability density function that computes how likely a new value is to have been drawn from
# a Gaussian distribution with a certain mean and standard deviation
# We will then predict for a new row the probability of being in each class, by calculating the probability of
# each class and the probability of the features of the new row being taken from a gaussian distribution with the
# mean and standard deviation of the columns of that class in the original dataset

In [70]:
# function to separate by class

def separate(dataset):
    """Group the rows of ``dataset`` by class label.

    The class label is taken to be the last element of each row.

    Args:
        dataset: Iterable of indexable rows; ``row[-1]`` is used as the label.

    Returns:
        dict mapping each label to the list of rows carrying it. Labels
        appear in order of first occurrence and rows keep their input order.
    """
    rows_by_class = {}
    for record in dataset:
        # setdefault creates the bucket the first time a label is seen.
        rows_by_class.setdefault(record[-1], []).append(record)
    return rows_by_class

In [71]:
# Demo: group a small two-class dataset by label and print each group

dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]

separated = separate(dataset)

for label, class_rows in separated.items():
    print(label)
    for row in class_rows:
        print(row)

0
[3.393533211, 2.331273381, 0]
[3.110073483, 1.781539638, 0]
[1.343808831, 3.368360954, 0]
[3.582294042, 4.67917911, 0]
[2.280362439, 2.866990263, 0]
1
[7.423436942, 4.696522875, 1]
[5.745051997, 3.533989803, 1]
[9.172168622, 2.511101045, 1]
[7.792783481, 3.424088941, 1]
[7.939820817, 0.791637231, 1]


In [72]:
# function to get the mean of a column

def mean(column):
    """Return the arithmetic mean of the values in ``column``.

    Args:
        column: Sized collection of numbers. It must be non-empty; an empty
            collection raises ZeroDivisionError.

    Returns:
        The sum of the values divided by how many there are.
    """
    total = sum(column)
    count = len(column)
    return total / count

In [73]:
# function to calculate the standard deviation of a column

def std_dev(column):
    """Return the sample standard deviation of ``column``.

    The sum of squared deviations is divided by ``len(column) - 1``, so the
    column needs at least two values; a single value raises
    ZeroDivisionError.

    Args:
        column: Sized collection of numbers.

    Returns:
        Square root of the sum of squared deviations from the mean divided
        by ``len(column) - 1``.
    """
    centre = mean(column)

    # Accumulate the squared distance of every value from the mean.
    squared_error = 0
    for value in column:
        squared_error += (value - centre) ** 2

    variance = squared_error / float(len(column) - 1)
    return variance ** 0.5

In [74]:
# function to get the above statistics for a whole dataset

def summarize(dataset):
    """Compute per-feature statistics for ``dataset``.

    Every row is expected to end with its class label. That last column is
    excluded *before* any statistics are computed, so labels may be
    non-numeric (for example strings).

    Args:
        dataset: Sequence of equal-length rows whose last element is the
            class label.

    Returns:
        List with one ``(mean, std_dev, count)`` tuple per feature column,
        in column order. An empty dataset yields an empty list.
    """
    # zip(*rows) transposes the rows into columns.
    columns = list(zip(*dataset))

    # Drop the trailing label column up front rather than summarising it and
    # deleting the result afterwards: statistics on labels are wasted work
    # and fail outright when the labels are not numbers.
    feature_columns = columns[:-1]

    return [(mean(col), std_dev(col), len(col)) for col in feature_columns]

In [75]:
# Summarize each feature column of the whole dataset (both classes pooled):
# one (mean, std_dev, count) tuple per feature
summary = summarize(dataset)
print(summary)

[(5.178333386499999, 2.7665845055177263, 10), (2.9984683241, 1.218556343617447, 10)]


In [76]:
# function to get the statistics for each class

def summarize_by_class(dataset):
    """Compute per-feature statistics separately for each class.

    Args:
        dataset: Rows whose last element is the class label.

    Returns:
        dict mapping each class label to the list that ``summarize`` returns
        for the rows belonging to that class.
    """
    return {
        label: summarize(class_rows)
        for label, class_rows in separate(dataset).items()
    }

In [77]:
# Print the (mean, std_dev, count) tuple of every feature, grouped by class
summary = summarize_by_class(dataset)
for label, feature_stats in summary.items():
    print(label)
    for row in feature_stats:
        print(row)

0
(2.7420144012, 0.9265683289298018, 5)
(3.0054686692, 1.1073295894898725, 5)
1
(7.6146523718, 1.2344321550313704, 5)
(2.9914679790000003, 1.4541931384601618, 5)


In [78]:
# function for a normal distribution
import math

def gaussian(x, mean, std_dev):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        std_dev: Standard deviation of the distribution; must be non-zero,
            otherwise ZeroDivisionError is raised.

    Returns:
        ``exp(-(x - mean)^2 / (2 * std_dev^2)) / (sqrt(2 * pi) * std_dev)``.
    """
    squared_distance = (x - mean) ** 2
    twice_variance = 2 * std_dev ** 2
    decay = math.exp(-(squared_distance / twice_variance))

    # Normalising constant that makes the density integrate to 1.
    scale = 1 / (math.sqrt(2 * math.pi) * std_dev)
    return scale * decay

In [79]:
# Test Gaussian PDF: evaluate the density with mean 1.0 and standard deviation 1.0
# at the mean itself and at one unit on either side of it
print(gaussian(1.0, 1.0, 1.0))
print(gaussian(2.0, 1.0, 1.0))
print(gaussian(0.0, 1.0, 1.0))

0.3989422804014327
0.24197072451914337
0.24197072451914337


In [80]:
# We expected to see that x = 2 and x = 0 have the same probability, as they are at equal distance around the mean x = 1
# Also, the mean x = 1 has the highest probability, as in a gaussian distribution the mean has the highest probability

In [81]:
# function to get the probability for a new x

# We will use the formula:
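# P(class|data) is proportional to P(class)*P(X1|class)*P(X2|class)*...
# (P(data) is the same for every class, so it is left out; the results are scores, not normalised probabilities)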

# where we find P(Xi|class) with the gaussian function

def naive_bayes(new_row, dataset):
    """Score ``new_row`` against every class found in ``dataset``.

    For each class the score is P(class) multiplied by the Gaussian density
    of each feature of ``new_row`` under that class's per-feature mean and
    standard deviation. The scores are not divided by P(data), so they are
    unnormalised and need not sum to 1.

    Args:
        new_row: Indexable of feature values. Only the positions matching
            the feature columns are read, so a trailing label is ignored.
        dataset: Training rows whose last element is the class label.

    Returns:
        dict mapping each class label to its score.
    """
    class_stats = summarize_by_class(dataset)

    # Each feature tuple of a class carries that class's row count, so the
    # first feature's count serves as the class size.
    n_rows = sum(stats[0][2] for stats in class_stats.values())

    scores = {}
    for label, stats in class_stats.items():
        score = stats[0][2] / n_rows  # prior, P(class)
        for idx, (mu, sigma, _count) in enumerate(stats):
            score = score * gaussian(new_row[idx], mu, sigma)
        scores[label] = score

    return scores


In [82]:
# Test with the first row in the dataset
# (its trailing label is not read: naive_bayes only indexes the feature positions)

probabilities = naive_bayes(dataset[0], dataset)
print(probabilities)

{0: 0.05032427673372076, 1: 0.00011557718379945765}


In [None]:
# We see that the probability of the first row having class 0 is much greater than of having class 1, as expected