## Naive Bayes

$P(A|B) = \large{\frac{P(B|A) P(A)}{P(B)}}$
<br><br>
$\text{Posterior} = \large{\frac{\text{Likelihood} \times \text{Prior}}{\text{Marginal}}}$

In [5]:
import numpy as np
import pandas as pd
from math import sqrt
from math import pi
from math import exp

### Loading Data

In [43]:
# Read the measurements from data.csv (path is relative to the working directory).
data = pd.read_csv('data.csv')

In [44]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Show each column's dtype: the four measurement columns are float64, Species is object.
data.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Gaussian Probability
Gaussian Naive Bayes assumes that, within each class, every feature follows a Gaussian (normal) distribution and that the features are independent of one another given the class.

In [14]:
def calc_gaussian_prob(x, mean, std):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution. A value of 0 makes the
            division below raise ``ZeroDivisionError``.

    Returns:
        The density ``exp(-(x - mean)**2 / (2 * std**2)) / (sqrt(2 * pi) * std)``.
    """
    # Normalising constant in front of the exponential.
    scale = 1 / (sqrt(2 * pi) * std)
    # Negative squared distance from the mean, in units of twice the variance.
    exponent = -((x - mean) ** 2 / (2 * std ** 2))
    return scale * exp(exponent)

### Calculating probabilities for all the classes

In [15]:
def calc_class_probs(summary_by_class, data):
    """Compute the unnormalised posterior score of one sample for every class.

    For each class the score is ``prior * product of per-feature Gaussian
    densities``, where the prior is the class's share of the training rows.

    Args:
        summary_by_class: Dict mapping a class label to a list with one
            ``(mean, std, row_count)`` tuple per feature.
        data: Feature values of a single sample, indexable by position in the
            same order as the per-feature tuples.

    Returns:
        Dict mapping each class label to its unnormalised score. Empty when
        ``summary_by_class`` is empty.
    """
    # The row count is read from the first feature tuple of each class, and the
    # total is derived from those counts instead of assuming a fixed dataset
    # size, so the priors stay correct for any training subset.
    class_rows = {
        species: col_stats[0][2]
        for species, col_stats in summary_by_class.items()
    }
    total_rows = sum(class_rows.values())

    probabilities = {}
    for species, col_stats in summary_by_class.items():
        # Start from the class prior, then multiply in each feature's density.
        probabilities[species] = class_rows[species] / total_rows
        for i, (mean, std, _class_rows) in enumerate(col_stats):
            probabilities[species] *= calc_gaussian_prob(data[i], mean, std)

    return probabilities

### Train Test Split

In [45]:
def train_test_data(data, n_per_class=10, random_state=None):
    """Hold out a fixed number of randomly chosen rows per species as a test set.

    Args:
        data: DataFrame with a "Species" column. Held-out rows are removed from
            the training portion by index label, so duplicate labels would
            drop extra rows.
        n_per_class: Number of rows sampled from each species for the test set.
        random_state: Optional integer seed. Pass a value to get the same split
            on every run; ``None`` (the default) gives a different split each time.

    Returns:
        ``(train_data, test_data)``: the rows that were not sampled, and the
        sampled rows grouped by species in order of first appearance.
    """
    # One generator shared by all species, so a single seed fixes the whole
    # split without reusing the same draw pattern for every class.
    rng = np.random.RandomState(random_state)

    # Collect the per-species samples and concatenate once instead of growing
    # a DataFrame inside the loop.
    samples = [
        data[data["Species"] == species].sample(n=n_per_class, random_state=rng)
        for species in data["Species"].unique()
    ]
    # pd.concat rejects an empty list; fall back to an empty slice of the input.
    test_data = pd.concat(samples, axis=0) if samples else data.iloc[0:0]

    return data.drop(test_data.index), test_data

### Training on the data

In [9]:
def train(data, verbose=True):
    """Summarise every feature column per species for Gaussian Naive Bayes.

    Args:
        data: DataFrame holding numeric feature columns plus a 'Species'
            label column.
        verbose: When True (the default) print each species followed by its
            per-feature statistics, one tuple per line.

    Returns:
        Dict mapping each species to a list with one
        ``(mean, std, row_count)`` tuple per feature column, in column order.
    """
    # Select features by name rather than by position so the label column
    # does not have to be the last one.
    feature_columns = [column for column in data.columns if column != 'Species']

    summary_by_class = {}
    for label in data['Species'].unique():
        label_data = data[data['Species'] == label]
        summary_by_class[label] = [
            # np.std uses ddof=0 here, i.e. the population standard deviation.
            (np.mean(label_data[column]), np.std(label_data[column]), len(label_data[column]))
            for column in feature_columns
        ]

    if verbose:
        for species, stats in summary_by_class.items():
            print(species)
            for stat in stats:
                print(stat)
            print()

    return summary_by_class
    


### Prediction

In [56]:
def predict(summary_by_class, newData, verbose=True):
    """Pick the most probable class for one sample.

    The per-class scores from ``calc_class_probs`` are divided by their sum so
    that they add up to 1, and the class with the largest share is returned.

    Args:
        summary_by_class: Per-class statistics, passed through to
            ``calc_class_probs``.
        newData: Feature values of the sample, passed through to
            ``calc_class_probs``.
        verbose: When True (the default) print every class with its normalised
            probability, followed by the chosen class and its probability.

    Returns:
        ``(resultSpecies, maxprob)``. ``("", 0)`` is returned when no class has
        a positive score (no classes at all, or every score is 0.0).
    """
    probabilities = calc_class_probs(summary_by_class, newData)
    total = sum(probabilities.values())

    if total == 0:
        # Every score is 0.0 (for example the densities underflowed for a
        # sample far from all classes); dividing by the sum would raise
        # ZeroDivisionError, so report zero probability for each class.
        posteriors = {species: 0.0 for species in probabilities}
    else:
        posteriors = {species: prob / total for species, prob in probabilities.items()}

    maxprob = 0
    resultSpecies = ""
    for species, posterior in posteriors.items():
        if verbose:
            print(species, ":- ", posterior)
        # Strict comparison: on a tie the class seen first is kept.
        if maxprob < posterior:
            maxprob = posterior
            resultSpecies = species

    if verbose:
        print("\nPrediction:- ", resultSpecies)
        print("Probability:- ", maxprob)

    return resultSpecies, maxprob

### Testing the model

In [63]:
def test(summary_by_class, testData, testResults, verbose=False):
    predictedClass = []
    predictedProb = []
    accuracy = 0
    
    for data in testData:
        resultSpecies, resultProb = predict(summary_by_class, data, verbose)
        predictedClass.append(resultSpecies)
        predictedProb.append(resultProb)
    
    for i in range(len(predictedClass)):
        if predictedClass[i] == testResults[i]:
            accuracy += 1
    
    print('Accuracy:- ', (accuracy/len(testResults))*100)
    return predictedClass, predictedProb

In [13]:
# Fit on the full dataset; summary_by_class is reassigned further down after the train/test split.
summary_by_class = train(data)

Iris-setosa
(5.006, 0.3489469873777391, 50)
(3.418, 0.37719490982779713, 50)
(1.464, 0.17176728442867112, 50)
(0.244, 0.10613199329137281, 50)

Iris-versicolor
(5.936, 0.5109833656783751, 50)
(2.7700000000000005, 0.31064449134018135, 50)
(4.26, 0.4651881339845203, 50)
(1.3259999999999998, 0.19576516544063705, 50)

Iris-virginica
(6.587999999999998, 0.6294886813914926, 50)
(2.974, 0.3192553836664309, 50)
(5.5520000000000005, 0.546347874526844, 50)
(2.0260000000000002, 0.2718896835115301, 50)



In [46]:
# Hold out random rows of every species for testing; no seed is set, so the split differs between runs.
train_data, test_data = train_test_data(data)

In [50]:
# Preview the training portion.
train_data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [48]:
# Preview the held-out rows; they keep their index labels from the original frame.
test_data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
18,5.7,3.8,1.7,0.3,Iris-setosa
44,5.1,3.8,1.9,0.4,Iris-setosa
49,5.0,3.3,1.4,0.2,Iris-setosa
32,5.2,4.1,1.5,0.1,Iris-setosa
20,5.4,3.4,1.7,0.2,Iris-setosa


In [51]:
# Refit on the training portion only, replacing the summary fitted on the full dataset.
summary_by_class = train(train_data)

Iris-setosa
(4.9799999999999995, 0.34655446902326914, 40)
(3.3950000000000005, 0.381411850890871, 40)
(1.4425000000000001, 0.16566155257029314, 40)
(0.22999999999999998, 0.09273618495495704, 40)

Iris-versicolor
(5.8575, 0.4898405352765326, 40)
(2.7449999999999997, 0.30244834269673226, 40)
(4.2475, 0.47169243156955576, 40)
(1.33, 0.198997487421324, 40)

Iris-virginica
(6.4925000000000015, 0.5917717042914438, 40)
(2.935, 0.27345017827750634, 40)
(5.4925, 0.4981904756215238, 40)
(2.0225, 0.27247706325487286, 40)



In [64]:
# Separate the held-out frame into the four feature columns and the true labels.
testData = test_data[["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]]
testResults = test_data["Species"]
# Pass plain arrays (.values) so test() iterates over rows and compares labels in order.
predictedClass, predictedProb = test(summary_by_class, testData.values, testResults.values)

Accuracy:-  93.33333333333333
