# Naïve Bayesian classifier

Write a program to implement the naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering few test data sets.

In [134]:
import pandas as pd
import math
import numpy as np

In [135]:
# load the data
df = pd.read_csv("pima-indians-diabetes.csv",header=None)

In [136]:
# shuffle and convert to arrays of float type values
data = df.sample(frac=1).astype(float).values
data

array([[  0.   , 135.   ,  68.   , ...,   0.365,  24.   ,   1.   ],
       [  6.   , 162.   ,  62.   , ...,   0.178,  50.   ,   1.   ],
       [  4.   , 146.   ,  78.   , ...,   0.52 ,  67.   ,   1.   ],
       ...,
       [  2.   ,  71.   ,  70.   , ...,   0.586,  22.   ,   0.   ],
       [  1.   , 139.   ,  46.   , ...,   0.654,  22.   ,   0.   ],
       [ 10.   , 139.   ,  80.   , ...,   1.441,  57.   ,   0.   ]])

In [137]:
# split data based on a ratio
split_ratio = 0.8
train = data[:int(len(data)*split_ratio),:] 
test = data[int(len(data)*split_ratio)+1:,:]

In [138]:
len(train)

614

In [139]:
len(test)

153

In [140]:
# Returns the uniques values in the data and their count,we assume the last column to be the target class i.e- 0/1
print(np.unique(train[:,-1],return_counts=True))
print(np.unique(test[:,-1],return_counts=True))

(array([0., 1.]), array([402, 212]))
(array([0., 1.]), array([98, 55]))


In [141]:
# Summerize by class

In [142]:
# First seperate by class
def seperateByClass(data):
    seperated = {}
    # loop over each row
    for i in range(len(data)):
        # If no such class exists , create one
        if data[i][-1] not in seperated.keys():
            seperated[data[i][-1]]=[]
            
        # append the data sample to that class group
        seperated[data[i][-1]].append(data[i])
    
    return seperated

In [143]:
# call seperate by class func
seperatedData = seperateByClass(train)
seperatedData.keys()

dict_keys([1.0, 0.0])

In [144]:
# create summaries for each class using the seperated values
summaries = {}

# to do that we need the below summarize function

# helper function to calc mean of a column
def mean(data):
    return sum(data)/float(len(data))

# helper function to calc std dev of a column
def std_dev(data):
    # std dev is sqrt of variance 
    # variance = sum over all values from i=1 to n :  (( x_i - x_mean )^2 ) / ( n - 1 )
    x_mean = mean(data)
    variance =  sum([pow(x - x_mean,2) for x in data]) # Numerator
    variance = variance / float(len(data)-1) # Denominator
    return math.sqrt(variance)

def summarize(data):
    # here we calc and store the mean and std dev of each data sample
    summary = []
    # Calculate mean and std_dev for each attribute column, zip(*data) unpacks list of rows (data),as a list of cols
    for attrb_col in zip(*data):
        summary.append([mean(attrb_col),std_dev(attrb_col)])

    # remove the entries for the last column as we dont need the summary of the target column
    del summary[-1]
    
    return summary

In [145]:
# call summerize for each class and store the value in summaries
for className,rows in seperatedData.items():
    summaries[className] = summarize(rows)
    
print("All data has been summarized and store in summaries dict")

All data has been summarized and store in summaries dict


In [146]:
# Now that we have the summaries we can use them to make predictions for new attributes to predict target class

In [147]:
def getProb(a , mean , std):
    """
    1/(sqrt(2*pi)*std_dev) * e to power -((x-mean)**2 / (2 * std_dev**2) )  <-- represents a normal distribution
    """
    return (1/(math.sqrt(2*math.pi)*std)) * math.exp(-((a-mean)**2 / (2 * pow(std,2)) ))

In [148]:
def getClassProbabilities(summaries,test_attributes):
    classProbs = {}
    for classLabel,summary in summaries.items():
        # We initialize the prob of the class to be 1 , cause we will be using this var for mul op below
        classProbs[classLabel] = 1
        #loop over each col in test_attr 
        for i in range(len(summary)):
            # extract mean and std for that col from summary
            mean,std = summary[i]
            # get attribute from the test_attrs to find its conditional prob
            attrib = test_attributes[i]
            # multiply classProb with condition prob of all the attributes of the row
            classProbs[classLabel] *= getProb(attrib,mean,std)
    #print("Class prob = ",classProbs)
    return classProbs

In [149]:
# below func can predict the target class label given the attributes
def predict(summaries,test_attributes):
    # we will have the first get the probability of each class given the test_attrib
    probs = getClassProbabilities(summaries,test_attributes)
    #print(probs)
    best_class = None
    best_prob  = -1
    # the label with the higher prob will be assigned 
    for cls,prob in probs.items():
        if best_class is None or prob > best_prob:
            best_class = cls
            best_prob = prob
    
    return best_class,best_prob 

In [150]:
def eval_test(summaries,test):
    score = 0
    
    # for each row in the test data
    for t in test:
        # Predict the class 
        cls,prob = predict(summaries,t)
        print(f"test target = {t[-1]} , predicted = {cls} ")
        # Check if target == predicted and update score
        if t[-1] == cls:
            score+=1
    
    print("Score = ",score,"/",len(test))
    return score/len(test)

In [151]:
res = eval_test(summaries,test)

test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 1.0 
test target = 0.0 , predicted = 1.0 
test target = 1.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
test target = 1.0 , predicted = 1.0 
test target = 0.0 , predicted = 0.0 
test target = 1.0 , predicted = 0.0 
test target = 1.0 , predicted = 1.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
test target = 1.0 , predicted = 0.0 
test target = 0.0 , predicted = 1.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 1.0 
test target = 0.0 , predicted = 1.0 
test target = 0.0 , predicted = 0.0 
test target = 1.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 1.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
test target = 0.0 , predicted = 0.0 
t

In [152]:
print("Accuracy = ",res*100)

Accuracy =  68.62745098039215


# End