**Aim:** Write a program to implement the Naïve Bayes classifier for an appropriate dataset and compute
the performance measures of the model.


**Theory:** Naive Bayes is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature.

For example, a fruit may be considered to be an apple if it is red, round, and about 3 inches in diameter. Even if these features depend on each other or upon the existence of the other features, all of these properties independently contribute to the probability that this fruit is an apple and that is why it is known as ‘Naive’.

A Naive Bayes model is easy to build and particularly useful for very large data sets. Despite its simplicity, Naive Bayes is known to outperform even highly sophisticated classification methods on some problems.

**Code:**

The dataset used is the **Iris dataset**.<br>
Importing libraries:

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
from math import exp
from math import pi
import random
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

In [2]:
# Load the Iris CSV into a DataFrame and drop the 'Id' column.
df=pd.read_csv('datasets_19_420_Iris.csv')
df=df.drop('Id',axis=1)

# Preview the first rows of the remaining columns.
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Replace the species names in the 'Species' column with the codes
# produced by LabelEncoder (they appear as 0., 1., 2. in the array below).
le=LabelEncoder()
df['Species']=le.fit_transform(df['Species'])


# Naive Bayes Implementation

In [11]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label.

    The label of a row is its last element. Returns a dict mapping each
    label to the list of rows that carry it, in their original order.
    """
    grouped = {}
    for record in dataset:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(record[-1], []).append(record)
    return grouped

In [12]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Raises ZeroDivisionError when ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [21]:
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The variance uses the ``n - 1`` denominator, so at least two values
    are required; fewer raise ZeroDivisionError.
    """
    avg = mean(numbers)
    variance = sum((x - avg) ** 2 for x in numbers) / float(len(numbers) - 1)
    # The file imports ``math`` but never the bare name ``sqrt``, so the
    # square root has to go through the module.
    return math.sqrt(variance)

In [14]:
def summarize_dataset(dataset):
    """Return a ``(mean, stdev, count)`` tuple for each feature column.

    Rows are transposed into columns with ``zip``. Statistics are computed
    for every column, and the entry for the last column (the one
    ``separate_by_class`` reads as the class label) is then discarded.
    """
    column_stats = []
    for values in zip(*dataset):
        column_stats.append((mean(values), stdev(values), len(values)))
    # The final entry belongs to the label column, not to a feature.
    column_stats.pop()
    return column_stats

In [15]:
def summarize_by_class(dataset):
    """Build per-class column statistics for ``dataset``.

    Rows are grouped with ``separate_by_class`` and each group is
    summarised with ``summarize_dataset``. Returns a dict mapping each
    class label to that group's list of ``(mean, stdev, count)`` tuples.
    """
    return {
        label: summarize_dataset(members)
        for label, members in separate_by_class(dataset).items()
    }

In [46]:
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density function at ``x``.

    Args:
        x: The value at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution. A value of zero
            leads to a division by zero.

    Returns:
        The density ``exp(-(x - mean)^2 / (2 stdev^2)) / (sqrt(2 pi) stdev)``.
    """
    exponent = exp(-1 * ((x - mean) ** 2 / (2 * stdev ** 2)))
    # The file imports ``math`` but never the bare name ``sqrt``, so the
    # square root has to go through the module.
    return (1 / (math.sqrt(2 * pi) * stdev)) * exponent

In [20]:
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class in ``summaries``.

    For each class the score starts at the class prior (the class row
    count divided by the total row count) and is multiplied by the value
    of ``calculate_probability`` for each summarised column. The scores
    are not normalised.

    Returns a dict mapping each class label to its score.
    """
    # Every column summary of a class stores the same row count, so the
    # first one is enough.
    total_rows = sum(stats[0][2] for stats in summaries.values())
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_summaries[0][2] / float(total_rows)
        for position, (mu, sigma, _count) in enumerate(class_summaries):
            score *= calculate_probability(row[position], mu, sigma)
        probabilities[class_value] = score
    return probabilities

In [24]:
# Convert the DataFrame to a NumPy array: the helper functions index rows
# by integer position (dataset[i], vector[-1]).
df=df.to_numpy()
df

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [4.8, 3. , 1.4, 0.1, 0. ],
       [4.3, 3. , 1.1, 0.1, 0. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [5.7, 3.8, 1.7, 0.3, 0. ],
       [5.1, 3.8, 1.5, 0.3, 0. ],
       [5.4, 3.4, 1.7, 0.2, 0. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [5.1, 3.3, 1.7, 0.5, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [5. , 3. , 1.6, 0.2, 0. ],
       [5. , 3.4, 1.6, 0.4, 0. ],
       [5.2, 3.5, 1.5, 0.2, 0. ],
       [5.2, 3.4, 1.4, 0.2, 0. ],
       [4.7, 3



def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Each distinct value found at index ``column`` gets a code starting at
    0, and the mapping is printed as it is built. Codes follow sorted
    order of the values, so the same data always yields the same mapping;
    plain ``set`` iteration order is not reproducible for string labels.

    Args:
        dataset: Sequence of mutable rows.
        column: Index of the column to encode.

    Returns:
        The dict mapping each original value to its integer code.
    """
    class_values = [row[column] for row in dataset]
    try:
        ordered = sorted(set(class_values))
    except TypeError:
        # Values that cannot be ordered against each other fall back to
        # first-seen order, which is still deterministic.
        ordered = list(dict.fromkeys(class_values))
    lookup = {}
    for i, value in enumerate(ordered):
        lookup[value] = i
        print('[%s] => %d' % (value, i))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [32]:
def predict(summaries, row):
    """Return the class label with the highest score for ``row``.

    Scores come from ``calculate_class_probabilities``. On ties the class
    seen first is kept. Returns None when ``summaries`` has no classes.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner = None
    winner_score = -1
    for candidate, score in scores.items():
        # Skip candidates that do not beat the current winner.
        if winner is not None and not score > winner_score:
            continue
        winner, winner_score = candidate, score
    return winner

150

In [29]:

# Map the values of the last column to integer codes. The printed mapping
# below shows the labels are already numeric at this point (0.0 -> 0, ...).
str_column_to_int(df, len(df[0])-1)

[0.0] => 0
[1.0] => 1
[2.0] => 2


{0.0: 0, 1.0: 1, 2.0: 2}

In [38]:
# Shuffle the rows in place, then fit the per-class summaries on the first
# 120 rows; rows 120-149 are used for evaluation further down.
# NOTE(review): the shuffle is unseeded, so the split and the reported
# metrics presumably change from run to run -- confirm whether a fixed seed is wanted.
np.random.shuffle(df)
model = summarize_by_class(df[:120])
model

{0.0: [(5.0, 0.33638949855315353, 39),
  (3.38974358974359, 0.3676345953273596, 39),
  (1.4717948717948717, 0.1685053770010696, 39),
  (0.23333333333333336, 0.0982299486257503, 39)],
 2.0: [(6.552380952380952, 0.6302295730527808, 42),
  (2.976190476190476, 0.3304169751511874, 42),
  (5.492857142857143, 0.5167406563483707, 42),
  (2.0261904761904757, 0.28203054998811206, 42)],
 1.0: [(6.012820512820511, 0.4985133093189701, 39),
  (2.7923076923076926, 0.31820600360261114, 39),
  (4.338461538461539, 0.41014463727053835, 39),
  (1.335897435897436, 0.18706483464743828, 39)]}

In [49]:
# Rows 120-149 are the ones not used to build the model.
test_indices = range(120, 150)
# True class of each held-out row (last column).
actual = [df[idx][-1] for idx in test_indices]
# Predicted class of each held-out row.
label = [predict(model, df[idx]) for idx in test_indices]

In [55]:
# Count the predictions that match the true label, then report the share.
correct = sum(1 for idx in range(len(label)) if label[idx] == actual[idx])
print('Accuracy is {}'.format(correct / len(label) * 1.0))

Accuracy is 0.9666666666666667


In [58]:
from sklearn import metrics
# Confusion matrix of the true labels (actual) against the predictions (label).
print(metrics.confusion_matrix(actual,label))

# Print per-class precision, recall, F1-score and support, plus the
# overall accuracy and averages, to three decimal places.
print(metrics.classification_report(actual,label, digits=3))

[[11  0  0]
 [ 0 10  1]
 [ 0  0  8]]
              precision    recall  f1-score   support

         0.0      1.000     1.000     1.000        11
         1.0      1.000     0.909     0.952        11
         2.0      0.889     1.000     0.941         8

    accuracy                          0.967        30
   macro avg      0.963     0.970     0.965        30
weighted avg      0.970     0.967     0.967        30

