# Naive Bayes is based on Bayes' theorem:
<br> <center>
    <b> p(a|b) = (p(b|a)*p(a))/p(b) </b>
</center> <br>
In our case, a is the target label and b is the feature vector. We assume that all features are mutually independent given the class. <br>In our dataset [Source: https://www.kaggle.com/msjaiclub/2classclassification?select=ex2data1.csv], whether a person is healthy or not is our target, and the features are person-go-for-walk and person-eat-healthy-food; the two features are treated as independent of each other, but both contribute to the target.

Using this independence assumption, we can factor the likelihood p(b|a) into one term per feature: <br> <center>
    <b>p(a|b) = { (p(b1|a)*p(b2|a)*p(b3|a)....*p(bn|a)) * p(a) } / p(b) </b>
</center>
<br>
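where p(a|b) is the posterior probability of class a given the features b <br>
p(bi|a) is the class-conditional probability (likelihood) of feature bi given class a <br>
p(a) = prior probability of a <br>
p(b) = marginal probability (evidence) of b <br>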

Now we select the class whose posterior probability is highest, so we apply <br> <center>
    <b>argmax a {p(a|b)}</b> <br> or <br> <b>argmax a { (p(b1|a)*p(b2|a)*p(b3|a)....*p(bn|a)) * p(a) } / p(b) </b> <br> </center> <br>
Since p(b) does not depend on a, it can be dropped from the argmax. We then take the log of the remaining product: multiplying many small terms can cause numerical underflow, and taking the log turns the product into a sum.

So the final formula becomes <br> <center> <b> argmax a { log(p(b1|a)) + .... + log(p(bn|a)) + log(p(a)) } </b> </center> <br>
where the prior probability p(a) is just the relative frequency of class a in the training data <br>
and the likelihood p(bi|a), for each of the i = 1...n features, is modelled by a Gaussian (normal) density.




In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier.

    For every class c, each feature is modelled as an independent normal
    distribution whose mean and variance are estimated from the training
    rows labelled c.  A sample x is assigned to the class that maximises

        log p(a=c) + sum_i log p(b_i = x_i | a=c)

    Parameters
    ----------
    var_smoothing : float, default 1e-9
        Fraction of the largest per-feature variance of the training data
        that is added to every per-class variance.  This keeps the Gaussian
        well defined when a feature is constant within a class.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fittodata(self, data, target):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        data : array-like of shape (samples, features)
            Training feature matrix.
        target : array-like of shape (samples,)
            Class label of every training row.

        Raises
        ------
        ValueError
            If ``data`` is not two-dimensional.
        """
        # Work on plain arrays so that pandas inputs are indexed by position.
        data = np.asarray(data, dtype=np.float64)
        target = np.asarray(target)
        if data.ndim != 2:
            raise ValueError(
                "data must be 2-D with shape (samples, features), "
                f"got {data.ndim} dimension(s)"
            )
        samples, features = data.shape

        # Distinct class labels present in the target array.
        self._classes = np.unique(target)
        classes = len(self._classes)

        # Row i of _mean / _var holds the statistics of class self._classes[i];
        # e.g. self._mean[0][1] is the mean of the 2nd feature for class index 0.
        self._mean = np.zeros((classes, features), dtype=np.float64)
        self._var = np.zeros((classes, features), dtype=np.float64)
        self._priors = np.zeros(classes, dtype=np.float64)

        # Variance floor: without it a feature that is constant within a class
        # has variance 0, and the Gaussian density degenerates to 0/0 = NaN.
        if data.size:
            epsilon = self.var_smoothing * data.var(axis=0).max()
            if epsilon == 0:
                # Every feature is constant over the whole data set; fall
                # back to the absolute smoothing value.
                epsilon = self.var_smoothing
        else:
            epsilon = 0.0

        for i, c in enumerate(self._classes):
            # Rows of the training data that belong to class c.
            data_c = data[target == c]

            # Column-wise statistics restricted to class c; they define the
            # marginal density of each feature given a = c.
            self._mean[i, :] = data_c.mean(axis=0)
            self._var[i, :] = data_c.var(axis=0) + epsilon

            # Prior p(a == c): relative frequency of class c.
            self._priors[i] = data_c.shape[0] / float(samples)

    def predictdata(self, data):
        """Predict a class label for every row of ``data``.

        Parameters
        ----------
        data : array-like of shape (samples, features)

        Returns
        -------
        numpy.ndarray
            One predicted label per row.
        """
        # Convert first: iterating a DataFrame yields column names, not rows.
        data = np.asarray(data, dtype=np.float64)
        return np.array([self._predictdata(x) for x in data])

    def _predictdata(self, x):
        """Return the most probable class label for a single sample ``x``."""
        scores = []
        for i in range(len(self._classes)):
            # log p(a = c) for the class with index i.
            log_prior = np.log(self._priors[i])

            # Under the independence assumption the joint likelihood is a
            # product over features; in log space that product is a sum.
            log_likelihood = np.sum(self._log_pdfdata(i, x))

            # Log of p(b1=x1|a=c) * ... * p(bn=xn|a=c) * p(a=c).
            scores.append(log_likelihood + log_prior)

        # Class with the highest (unnormalised) log posterior.
        return self._classes[np.argmax(scores)]

    def _log_pdfdata(self, class_i, x):
        """Return the per-feature Gaussian log-density of ``x`` for a class.

        The log-density is evaluated directly instead of as ``log(pdf)``:
        for samples far from the class mean the density underflows to 0,
        its log becomes ``-inf`` for every class, and the classes can no
        longer be ranked.
        """
        mean = self._mean[class_i]
        var = self._var[class_i]
        return -0.5 * np.log(2 * np.pi * var) - (x - mean) ** 2 / (2 * var)

    def _pdfdata(self, class_i, x):
        """Return the per-feature Gaussian density of ``x`` for a class.

        The result is an array with one entry per feature:
        ``[p(b0=x0|a=c), p(b1=x1|a=c), ...]`` for the class with index
        ``class_i``.
        """
        mean = self._mean[class_i]
        var = self._var[class_i]

        num = np.exp(-(x - mean) ** 2 / (2 * var))
        deno = np.sqrt(2 * np.pi * var)
        return num / deno


# Train

In [42]:
# Data set from Kaggle [https://www.kaggle.com/msjaiclub/2classclassification?select=ex2data1.csv]
df = pd.read_csv('ex2data1.csv')
# Feature matrix: every column except the label, as a plain NumPy array.
data = np.asarray(df.drop(columns='label'))

In [43]:
# Preview the first five feature rows.
data[:5]

array([[34.62365962, 78.02469282],
       [30.28671077, 43.89499752],
       [35.84740877, 72.90219803],
       [60.18259939, 86.3085521 ],
       [79.03273605, 75.34437644]])

In [44]:
# Class labels, taken from the 'label' column of the loaded frame.
target = df.loc[:, 'label']
# Preview the first five labels.
target.head(n=5)

0    0
1    0
2    0
3    1
4    1
Name: label, dtype: int64

In [45]:
# 80/20 train/test split; the fixed random_state makes the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [46]:
def accuracy(target_true, target_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    target_true : array-like
        Ground-truth labels.
    target_pred : array-like
        Predicted labels, same shape as ``target_true``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool
    # rather than an element-wise result, which would give a wrong score.
    target_true = np.asarray(target_true)
    target_pred = np.asarray(target_pred)
    if target_true.shape != target_pred.shape:
        # Mismatched shapes would otherwise broadcast silently.
        raise ValueError(
            "target_true and target_pred must have the same shape, got "
            f"{target_true.shape} and {target_pred.shape}"
        )
    return np.sum(target_true == target_pred) / len(target_true)

In [47]:
# Create the classifier and fit it on the training split.
nb = NaiveBayesClassifier()
nb.fittodata(data=data_train, target=target_train)

# Test

In [48]:
# Predicted label for every row of the held-out test split.
predictions = nb.predictdata(data=data_test)

In [49]:
# Report the share of test labels that were predicted correctly.
print(
    "Naive Bayes classification accuracy",
    accuracy(target_true=target_test, target_pred=predictions),
)

Naive Bayes classification accuracy 0.8
